lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34 use re "/aa";
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 sub NON_ASCII_PLATFORM { ord("A") != 65 }
  40
  41 ##########################################################################
  42 #
  43 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  44 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  45 # a pod file and .t files, depending on option parameters.
  46 #
  47 # The structure of this file is:
  48 #   First these introductory comments; then
  49 #   code needed for everywhere, such as debugging stuff; then
  50 #   code to handle input parameters; then
  51 #   data structures likely to be of external interest (some of which depend on
  52 #       the input parameters, so follows them; then
  53 #   more data structures and subroutine and package (class) definitions; then
  54 #   the small actual loop to process the input files and finish up; then
  55 #   a __DATA__ section, for the .t tests
  56 #
  57 # This program works on all releases of Unicode so far.  The outputs have been
  58 # scrutinized most intently for release 5.1.  The others have been checked for
  59 # somewhat more than just sanity.  It can handle all non-provisional Unicode
  60 # character properties in those releases.
  61 #
  62 # This program is mostly about Unicode character (or code point) properties.
  63 # A property describes some attribute or quality of a code point, like if it
  64 # is lowercase or not, its name, what version of Unicode it was first defined
  65 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  66 # possibilities by making all properties into mappings from each code point
  67 # into some corresponding value.  In the case of it being lowercase or not,
  68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  69 # property maps each Unicode code point to a single value, called a "property
  70 # value".  (Some more recently defined properties, map a code point to a set
  71 # of values.)
  72 #
  73 # When using a property in a regular expression, what is desired isn't the
  74 # mapping of the code point to its property's value, but the reverse (or the
  75 # mathematical "inverse relation"): starting with the property value, "Does a
  76 # code point map to it?"  These are written in a "compound" form:
  77 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  78 # files containing the lists of code points that map to each such regular
  79 # expression property value, one file per list
  80 #
  81 # There is also a single form shortcut that Perl adds for many of the commonly
  82 # used properties.  This happens for all binary properties, plus script,
  83 # general_category, and block properties.
  84 #
  85 # Thus the outputs of this program are files.  There are map files, mostly in
  86 # the 'To' directory; and there are list files for use in regular expression
  87 # matching, all in subdirectories of the 'lib' directory, with each
  88 # subdirectory being named for the property that the lists in it are for.
  89 # Bookkeeping, test, and documentation files are also generated.
  90
  91 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  92 my $map_directory = 'To';        # Where map files go.
  93
  94 # DATA STRUCTURES
  95 #
  96 # The major data structures of this program are Property, of course, but also
  97 # Table.  There are two kinds of tables, very similar to each other.
  98 # "Match_Table" is the data structure giving the list of code points that have
  99 # a particular property value, mentioned above.  There is also a "Map_Table"
 100 # data structure which gives the property's mapping from code point to value.
 101 # There are two structures because the match tables need to be combined in
 102 # various ways, such as constructing unions, intersections, complements, etc.,
 103 # and the map ones don't.  And there would be problems, perhaps subtle, if
 104 # a map table were inadvertently operated on in some of those ways.
 105 # The use of separate classes with operations defined on one but not the other
 106 # prevents accidentally confusing the two.
 107 #
 108 # At the heart of each table's data structure is a "Range_List", which is just
 109 # an ordered list of "Ranges", plus ancillary information, and methods to
 110 # operate on them.  A Range is a compact way to store property information.
 111 # Each range has a starting code point, an ending code point, and a value that
 112 # is meant to apply to all the code points between the two end points,
 113 # inclusive.  For a map table, this value is the property value for those
 114 # code points.  Two such ranges could be written like this:
 115 #   0x41 .. 0x5A, 'Upper',
 116 #   0x61 .. 0x7A, 'Lower'
 117 #
 118 # Each range also has a type used as a convenience to classify the values.
 119 # Most ranges in this program will be Type 0, or normal, but there are some
 120 # ranges that have a non-zero type.  These are used only in map tables, and
 121 # are for mappings that don't fit into the normal scheme of things.  Mappings
 122 # that require a hash entry to communicate with utf8.c are one example;
 123 # another example is mappings for charnames.pm to use which indicate a name
 124 # that is algorithmically determinable from its code point (and the reverse).
 125 # These are used to significantly compact these tables, instead of listing
 126 # each one of the tens of thousands individually.
 127 #
 128 # In a match table, the value of a range is irrelevant (and hence the type as
 129 # well, which will always be 0), and arbitrarily set to the null string.
 130 # Using the example above, there would be two match tables for those two
 131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 132 # named Lower would contain 0x61..0x7A.
 133 #
 134 # Actually, there are two types of range lists, "Range_Map" is the one
 135 # associated with map tables, and "Range_List" with match tables.
 136 # Again, this is so that methods can be defined on one and not the others so
 137 # as to prevent operating on them in incorrect ways.
 138 #
 139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 140 # in the perl core.  All tables could in theory be written, but some are
 141 # suppressed because there is no current practical use for them.  It is easy
 142 # to change which get written by changing various lists that are near the top
 143 # of the actual code in this file.  The table data structures contain enough
 144 # ancillary information to allow them to be treated as separate entities for
 145 # writing, such as the path to each one's file.  There is a heading in each
 146 # map table that gives the format of its entries, and what the map is for all
 147 # the code points missing from it.  (This allows tables to be more compact.)
 148 #
 149 # The Property data structure contains one or more tables.  All properties
 150 # contain a map table (except the $perl property which is a
 151 # pseudo-property containing only match tables), and any properties that
 152 # are usable in regular expression matches also contain various matching
 153 # tables, one for each value the property can have.  A binary property can
 154 # have two values, True and False (or Y and N, which are preferred by Unicode
 155 # terminology).  Thus each of these properties will have a map table that
 156 # takes every code point and maps it to Y or N (but having ranges cuts the
 157 # number of entries in that table way down), and two match tables, one
 158 # which has a list of all the code points that map to Y, and one for all the
 159 # code points that map to N.  (For each binary property, a third table is also
 160 # generated for the pseudo Perl property.  It contains the identical code
 161 # points as the Y table, but can be written in regular expressions, not in the
 162 # compound form, but in a "single" form like \p{IsUppercase}.)  Many
 163 # properties are binary, but some properties have several possible values,
 164 # some have many, and properties like Name have a different value for every
 165 # named code point.  Those will not, unless the controlling lists are changed,
 166 # have their match tables written out.  But all the ones which can be used in
 167 # regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
 168 # a property would have either its map table or its match tables written but
 169 # not both.  Again, what gets written is controlled by lists which can easily
 170 # be changed.  Starting in 5.14, advantage was taken of this, and all the map
 171 # tables needed to reconstruct the Unicode db are now written out, while
 172 # suppressing the Unicode .txt files that contain the data.  Our tables are
 173 # much more compact than the .txt files, so a significant space savings was
 174 # achieved.  Also, tables are not written out that are trivially derivable
 175 # from tables that do get written.  So, there typically is no file containing
 176 # the code points not matched by a binary property (the table for \P{} versus
 177 # lowercase \p{}), since you just need to invert the True table to get the
 178 # False table.
 179
 180 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
 181 # how many match tables there are and the content of the maps.  This 'Type' is
 182 # different than a range 'Type', so don't get confused by the two concepts
 183 # having the same name.
 184 #
 185 # For information about the Unicode properties, see Unicode's UAX44 document:
 186
 187 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 188
 189 # As stated earlier, this program will work on any release of Unicode so far.
 190 # Most obvious problems in earlier data have NOT been corrected except when
 191 # necessary to make Perl or this program work reasonably, and to keep out
 192 # potential security issues.  For example, no folding information was given in
 193 # early releases, so this program substitutes lower case instead, just so that
 194 # a regular expression with the /i option will do something that actually
 195 # gives the right results in many cases.  There are also a couple other
 196 # corrections for version 1.1.5, commented at the point they are made.  As an
 197 # example of corrections that weren't made (but could be) is this statement
 198 # from DerivedAge.txt: "The supplementary private use code points and the
 199 # non-character code points were assigned in version 2.0, but not specifically
 200 # listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
 201 # it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
 202 # further down in these introductory comments.
 203 #
 204 # This program works on all non-provisional properties as of the current
 205 # Unicode release, though the files for some are suppressed for various
 206 # reasons.  You can change which are output by changing lists in this program.
 207 #
 208 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 209 # loose matchings rules (from Unicode TR18):
 210 #
 211 #    The recommended names for UCD properties and property values are in
 212 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 213 #    [PropValue]. There are both abbreviated names and longer, more
 214 #    descriptive names. It is strongly recommended that both names be
 215 #    recognized, and that loose matching of property names be used,
 216 #    whereby the case distinctions, whitespace, hyphens, and underbar
 217 #    are ignored.
 218 #
 219 # The program still allows Fuzzy to override its determination of if loose
 220 # matching should be used, but it isn't currently used, as it is no longer
 221 # needed; the calculations it makes are good enough.
 222 #
 223 # SUMMARY OF HOW IT WORKS:
 224 #
 225 #   Process arguments
 226 #
 227 #   A list is constructed containing each input file that is to be processed
 228 #
 229 #   Each file on the list is processed in a loop, using the associated handler
 230 #   code for each:
 231 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 232 #            first.  These files name the properties and property values.
 233 #            Objects are created of all the property and property value names
 234 #            that the rest of the input should expect, including all synonyms.
 235 #        The other input files give mappings from properties to property
 236 #           values.  That is, they list code points and say what the mapping
 237 #           is under the given property.  Some files give the mappings for
 238 #           just one property; and some for many.  This program goes through
 239 #           each file and populates the properties and their map tables from
 240 #           them.  Some properties are listed in more than one file, and
 241 #           Unicode has set up a precedence as to which has priority if there
 242 #           is a conflict.  Thus the order of processing matters, and this
 243 #           program handles the conflict possibility by processing the
 244 #           overriding input files last, so that if necessary they replace
 245 #           earlier values.
 246 #        After this is all done, the program creates the property mappings not
 247 #            furnished by Unicode, but derivable from what it does give.
 248 #        The tables of code points that match each property value in each
 249 #            property that is accessible by regular expressions are created.
 250 #        The Perl-defined properties are created and populated.  Many of these
 251 #            require data determined from the earlier steps
 252 #        Any Perl-defined synonyms are created, and name clashes between Perl
 253 #            and Unicode are reconciled and warned about.
 254 #        All the properties are written to files
 255 #        Any other files are written, and final warnings issued.
 256 #
 257 # For clarity, a number of operators have been overloaded to work on tables:
 258 #   ~ means invert (take all characters not in the set).  The more
 259 #       conventional '!' is not used because of the possibility of confusing
 260 #       it with the actual boolean operation.
 261 #   + means union
 262 #   - means subtraction
 263 #   & means intersection
 264 # The precedence of these is the order listed.  Parentheses should be
 265 # copiously used.  These are not a general scheme.  The operations aren't
 266 # defined for a number of things, deliberately, to avoid getting into trouble.
 267 # Operations are done on references and affect the underlying structures, so
 268 # that the copy constructors for them have been overloaded to not return a new
 269 # clone, but the input object itself.
 270 #
 271 # The bool operator is deliberately not overloaded to avoid confusion with
 272 # "should it mean if the object merely exists, or also is non-empty?".
 273 #
 274 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 275 #
 276 # This program needs to be able to run under miniperl.  Therefore, it uses a
 277 # minimum of other modules, and hence implements some things itself that could
 278 # be gotten from CPAN
 279 #
 280 # This program uses inputs published by the Unicode Consortium.  These can
 281 # change incompatibly between releases without the Perl maintainers realizing
 282 # it.  Therefore this program is now designed to try to flag these.  It looks
 283 # at the directories where the inputs are, and flags any unrecognized files.
 284 # It keeps track of all the properties in the files it handles, and flags any
 285 # that it doesn't know how to handle.  It also flags any input lines that
 286 # don't match the expected syntax, among other checks.
 287 #
 288 # It is also designed so if a new input file matches one of the known
 289 # templates, one hopefully just needs to add it to a list to have it
 290 # processed.
 291 #
 292 # As mentioned earlier, some properties are given in more than one file.  In
 293 # particular, the files in the extracted directory are supposedly just
 294 # reformattings of the others.  But they contain information not easily
 295 # derivable from the other files, including results for Unihan, which this
 296 # program doesn't ordinarily look at, and for unassigned code points.  They
 297 # also have historically had errors or been incomplete.  In an attempt to
 298 # create the best possible data, this program thus processes them first to
 299 # glean information missing from the other files; then processes those other
 300 # files to override any errors in the extracted ones.  Much of the design was
 301 # driven by this need to store things and then possibly override them.
 302 #
 303 # It tries to keep fatal errors to a minimum, to generate something usable for
 304 # testing purposes.  It always looks for files that could be inputs, and will
 305 # warn about any that it doesn't know how to handle (the -q option suppresses
 306 # the warning).
 307 #
 308 # Why is there more than one type of range?
 309 #   This simplified things.  There are some very specialized code points that
 310 #   have to be handled specially for output, such as Hangul syllable names.
 311 #   By creating a range type (done late in the development process), it
 312 #   allowed this to be stored with the range, and overridden by other input.
 313 #   Originally these were stored in another data structure, and it became a
 314 #   mess trying to decide if a second file that was for the same property was
 315 #   overriding the earlier one or not.
 316 #
 317 # Why are there two kinds of tables, match and map?
 318 #   (And there is a base class shared by the two as well.)  As stated above,
 319 #   they actually are for different things.  Development proceeded much more
 320 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 321 #   give the property value for every code point (actually every code point
 322 #   that doesn't map to a default value).  Match tables are used for regular
 323 #   expression matches, and are essentially the inverse mapping.  Separating
 324 #   the two allows more specialized methods, and error checks so that one
 325 #   can't just take the intersection of two map tables, for example, as that
 326 #   is nonsensical.
 327 #
 328 # What about 'fate' and 'status'.  The concept of a table's fate was created
 329 #   late when it became clear that something more was needed.  The difference
 330 #   between this and 'status' is unclean, and could be improved if someone
 331 #   wanted to spend the effort.
 332 #
 333 # DEBUGGING
 334 #
 335 # This program is written so it will run under miniperl.  Occasionally changes
 336 # will cause an error where the backtrace doesn't work well under miniperl.
 337 # To diagnose the problem, you can instead run it under regular perl, if you
 338 # have one compiled.
 339 #
 340 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 341 # to return true.  Then a line like
 342 #
 343 # local $to_trace = 1 if main::DEBUG;
 344 #
 345 # can be added to enable tracing in its lexical scope (plus dynamic) or until
 346 # you insert another line:
 347 #
 348 # local $to_trace = 0 if main::DEBUG;
 349 #
 350 # To actually trace, use a line like "trace $a, @b, %c, ...;
 351 #
 352 # Some of the more complex subroutines already have trace statements in them.
 353 # Permanent trace statements should be like:
 354 #
 355 # trace ... if main::DEBUG && $to_trace;
 356 #
 357 # If there is just one or a few files that you're debugging, you can easily
 358 # cause most everything else to be skipped.  Change the line
 359 #
 360 # my $debug_skip = 0;
 361 #
 362 # to 1, and every file whose object is in @input_file_objects and doesn't have
 363 # a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
 364 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
 365 #
 366 # To compare the output tables, it may be useful to specify the -annotate
 367 # flag.  (As of this writing, this can't be done on a clean workspace, due to
 368 # requirements in Text::Tabs used in this option; so first run mktables
 369 # without this option.)  This option adds comment lines to each table, one for
 370 # each non-algorithmically named character giving, currently its code point,
 371 # name, and graphic representation if printable (and you have a font that
 372 # knows about it).  This makes it easier to see what the particular code
 373 # points are in each output table.  Non-named code points are annotated with a
 374 # description of their status, and contiguous ones with the same description
 375 # will be output as a range rather than individually.  Algorithmically named
 376 # characters are also output as ranges, except when there are just a few
 377 # contiguous ones.
 378 #
 379 # FUTURE ISSUES
 380 #
 381 # The program would break if Unicode were to change its names so that
 382 # interior white space, underscores, or dashes differences were significant
 383 # within property and property value names.
 384 #
 385 # It might be easier to use the xml versions of the UCD if this program ever
 386 # would need heavy revision, and the ability to handle old versions was not
 387 # required.
 388 #
 389 # There is the potential for name collisions, in that Perl has chosen names
 390 # that Unicode could decide it also likes.  There have been such collisions in
 391 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 392 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 393 # collisions, which were withdrawn before the final release, because of Perl's
 394 # and other's protests.  These all involved new properties which began with
 395 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 396 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 397 # Unicode document, so they are unlikely to be used by Unicode for another
 398 # purpose.  However, they might try something beginning with 'In', or use any
 399 # of the other Perl-defined properties.  This program will warn you of name
 400 # collisions, and refuse to generate tables with them, but manual intervention
 401 # will be required in this event.  One scheme that could be implemented, if
 402 # necessary, would be to have this program generate another file, or add a
 403 # field to mktables.lst that gives the date of first definition of a property.
 404 # Each new release of Unicode would use that file as a basis for the next
 405 # iteration.  And the Perl synonym addition code could sort based on the age
 406 # of the property, so older properties get priority, and newer ones that clash
 407 # would be refused; hence existing code would not be impacted, and some other
 408 # synonym would have to be used for the new property.  This is ugly, and
 409 # manual intervention would certainly be easier to do in the short run; lets
 410 # hope it never comes to this.
 411 #
 412 # A NOTE ON UNIHAN
 413 #
 414 # This program can generate tables from the Unihan database.  But it doesn't
 415 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 416 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 417 # database was split into 8 different files, all beginning with the letters
 418 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 419 # know which of the many properties in the file(s) should have tables created
 420 # for them.  It will create tables for any properties listed in
 421 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 422 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 423 # property you want is not in those files of the release you are building
 424 # against, you must add it to those two arrays.  Starting in 4.0, the
 425 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 426 # is present in the directory, a table will be generated for that property.
 427 # In 5.2, several more properties were added.  For your convenience, the two
 428 # arrays are initialized with all the 6.0 listed properties that are also in
 429 # earlier releases.  But these are commented out.  You can just uncomment the
 430 # ones you want, or use them as a template for adding entries for other
 431 # properties.
 432 #
 433 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 434 # and filter_unihan_line() are the functions where this is done.  This program
 435 # already does some adjusting to make the lines look more like the rest of the
 436 # Unicode DB;  You can see what that is in filter_unihan_line()
 437 #
 438 # There is a bug in the 3.2 data file in which some values for the
 439 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 440 # could be added for these; or for a particular installation, the Unihan.txt
 441 # file could be edited to fix them.
 442 #
 443 # HOW TO ADD A FILE TO BE PROCESSED
 444 #
 445 # A new file from Unicode needs to have an object constructed for it in
 446 # @input_file_objects, probably at the end or at the end of the extracted
 447 # ones.  The program should warn you if its name will clash with others on
 448 # restrictive file systems, like DOS.  If so, figure out a better name, and
 449 # add lines to the README.perl file giving that.  If the file is a character
 450 # property, it should be in the format that Unicode has implicitly
 451 # standardized for such files for the more recently introduced ones.
 452 # If so, the Input_file constructor for @input_file_objects can just be the
 453 # file name and release it first appeared in.  If not, then it should be
 454 # possible to construct an each_line_handler() to massage the line into the
 455 # standardized form.
 456 #
 457 # For non-character properties, more code will be needed.  You can look at
 458 # the existing entries for clues.
 459 #
 460 # UNICODE VERSIONS NOTES
 461 #
 462 # The Unicode UCD has had a number of errors in it over the versions.  And
 463 # these remain, by policy, in the standard for that version.  Therefore it is
 464 # risky to correct them, because code may be expecting the error.  So this
 465 # program doesn't generally make changes, unless the error breaks the Perl
 466 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 467 # for U+1105, which causes real problems for the algorithms for Jamo
 468 # calculations, so it is changed here.
 469 #
 470 # But it isn't so clear cut as to what to do about concepts that are
 471 # introduced in a later release; should they extend back to earlier releases
 472 # where the concept just didn't exist?  It was easier to do this than to not,
 473 # so that's what was done.  For example, the default value for code points not
 474 # in the files for various properties was probably undefined until changed by
 475 # some version.  No_Block for blocks is such an example.  This program will
 476 # assign No_Block even in Unicode versions that didn't have it.  This has the
 477 # benefit that code being written doesn't have to special case earlier
 478 # versions; and the detriment that it doesn't match the Standard precisely for
 479 # the affected versions.
 480 #
 481 # Here are some observations about some of the issues in early versions:
 482 #
 483 # Prior to version 3.0, there were 3 character decompositions.  These are not
 484 # handled by Unicode::Normalize, nor will it compile when presented a version
 485 # that has them.  However, you can trivially get it to compile by simply
 486 # ignoring those decompositions, by changing the croak to a carp.  At the time
 487 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
 488 #
 489 #   croak("Weird Canonical Decomposition of U+$h");
 490 #
 491 # Simply change to a carp.  It will compile, but will not know about any three
 492 # character decomposition.
 493
 494 # The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
 495 # that the reason is that the CJK block starting at 4E00 was removed from
 496 # PropList, and was not put back in until 3.1.0.  The Perl extension (the
 497 # single property name \p{alpha}) has the correct values.  But the compound
 498 # form is simply not generated until 3.1, as it can be argued that prior to
 499 # this release, this was not an official property.  The comments for
 500 # filter_old_style_proplist() give more details.
 501 #
 502 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 503 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 504 # reason is that 3.2 introduced U+205F=medium math space, which was not
 505 # classed as white space, but Perl figured out that it should have been. 4.0
 506 # reclassified it correctly.
 507 #
 508 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 509 # this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
 510 # became 202, and ATBL was left with no code points, as all the ones that
 511 # mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
 512 # name for the class, it would not have been affected, but if it used the
 513 # mnemonic, it would have been.
 514 #
 515 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 516 # points which eventually came to have this script property value, instead
 517 # mapped to "Unknown".  But in the next release all these code points were
 518 # moved to \p{sc=common} instead.
 519 #
 520 # The default for missing code points for BidiClass is complicated.  Starting
 521 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 522 # tries to do the best it can for earlier releases.  It is done in
 523 # process_PropertyAliases()
 524 #
 525 # In version 2.1.2, the entry in UnicodeData.txt:
 526 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
 527 # should instead be
 528 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
 529 # Without this change, there are casing problems for this character.
 530 #
 531 ##############################################################################
 532
 533 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 534                         # and errors
 535 my $MAX_LINE_WIDTH = 78;
 536
 537 # Debugging aid to skip most files so as to not be distracted by them when
 538 # concentrating on the ones being debugged.  Add
 539 # non_skip => 1,
 540 # to the constructor for those files you want processed when you set this.
 541 # Files with a first version number of 0 are special: they are always
 542 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 543 # UnicodeData.txt must not be skipped if you want this program to not die
 544 # before normal completion.
 545 my $debug_skip = 0;
 546
 547
 548 # Normally these are suppressed.
 549 my $write_Unicode_deprecated_tables = 0;
 550
 551 # Set to 1 to enable tracing.
 552 our $to_trace = 0;
 553
 554 { # Closure for trace: debugging aid
 555     my $print_caller = 1;        # ? Include calling subroutine name
 556     my $main_with_colon = 'main::';
 557     my $main_colon_length = length($main_with_colon);
 558
 559     sub trace {
 560         return unless $to_trace;        # Do nothing if global flag not set
 561
 562         my @input = @_;
 563
 564         local $DB::trace = 0;
 565         $DB::trace = 0;          # Quiet 'used only once' message
 566
 567         my $line_number;
 568
 569         # Loop looking up the stack to get the first non-trace caller
 570         my $caller_line;
 571         my $caller_name;
 572         my $i = 0;
 573         do {
 574             $line_number = $caller_line;
 575             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 576             $caller = $main_with_colon unless defined $caller;
 577
 578             $caller_name = $caller;
 579
 580             # get rid of pkg
 581             $caller_name =~ s/.*:://;
 582             if (substr($caller_name, 0, $main_colon_length)
 583                 eq $main_with_colon)
 584             {
 585                 $caller_name = substr($caller_name, $main_colon_length);
 586             }
 587
 588         } until ($caller_name ne 'trace');
 589
 590         # If the stack was empty, we were called from the top level
 591         $caller_name = 'main' if ($caller_name eq ""
 592                                     || $caller_name eq 'trace');
 593
 594         my $output = "";
 595         foreach my $string (@input) {
 596             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 597             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 598                 $output .= simple_dumper($string);
 599             }
 600             else {
 601                 $string = "$string" if ref $string;
 602                 $string = $UNDEF unless defined $string;
 603                 chomp $string;
 604                 $string = '""' if $string eq "";
 605                 $output .= " " if $output ne ""
 606                                 && $string ne ""
 607                                 && substr($output, -1, 1) ne " "
 608                                 && substr($string, 0, 1) ne " ";
 609                 $output .= $string;
 610             }
 611         }
 612
 613         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 614         print STDERR "$caller_name: " if $print_caller;
 615         print STDERR $output, "\n";
 616         return;
 617     }
 618 }
 619
 620 # This is for a rarely used development feature that allows you to compare two
 621 # versions of the Unicode standard without having to deal with changes caused
 622 # by the code points introduced in the later version.  Change the 0 to a
 623 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 624 # code points introduced in that release and earlier will be used; later ones
 625 # are thrown away.  You use the version number of the earliest one you want to
 626 # compare; then run this program on directory structures containing each
 627 # release, and compare the outputs.  These outputs will therefore include only
 628 # the code points common to both releases, and you can see the changes caused
 629 # just by the underlying release semantic changes.  For versions earlier than
 630 # 3.2, you must copy a version of DAge.txt into the directory.
 631 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 632 my $compare_versions = DEBUG
 633                        && $string_compare_versions
 634                        && pack "C*", split /\./, $string_compare_versions;
 635
 636 sub uniques {
 637     # Returns non-duplicated input values.  From "Perl Best Practices:
 638     # Encapsulated Cleverness".  p. 455 in first edition.
 639
 640     my %seen;
 641     # Arguably this breaks encapsulation, if the goal is to permit multiple
 642     # distinct objects to stringify to the same value, and be interchangeable.
 643     # However, for this program, no two objects stringify identically, and all
 644     # lists passed to this function are either objects or strings. So this
 645     # doesn't affect correctness, but it does give a couple of percent speedup.
 646     no overloading;
 647     return grep { ! $seen{$_}++ } @_;
 648 }
 649
 650 $0 = File::Spec->canonpath($0);
 651
 652 my $make_test_script = 0;      # ? Should we output a test script
 653 my $make_norm_test_script = 0; # ? Should we output a normalization test script
 654 my $write_unchanged_files = 0; # ? Should we update the output files even if
 655                                #    we don't think they have changed
 656 my $use_directory = "";        # ? Should we chdir somewhere.
 657 my $pod_directory;             # input directory to store the pod file.
 658 my $pod_file = 'perluniprops';
 659 my $t_path;                     # Path to the .t test file
 660 my $file_list = 'mktables.lst'; # File to store input and output file names.
 661                                # This is used to speed up the build, by not
 662                                # executing the main body of the program if
 663                                # nothing on the list has changed since the
 664                                # previous build
 665 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 666                                # make a list so that when the pumpking is
 667                                # preparing a release, s/he won't have to do
 668                                # special things
 669 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 670                                # in the input.
 671 my $output_range_counts = $debugging_build;   # ? Should we include the number
 672                                               # of code points in ranges in
 673                                               # the output
 674 my $annotate = 0;              # ? Should character names be in the output
 675
 676 # Verbosity levels; 0 is quiet
 677 my $NORMAL_VERBOSITY = 1;
 678 my $PROGRESS = 2;
 679 my $VERBOSE = 3;
 680
 681 my $verbosity = $NORMAL_VERBOSITY;
 682
 683 # Stored in mktables.lst so that if this program is called with different
 684 # options, will regenerate even if the files otherwise look like they're
 685 # up-to-date.
 686 my $command_line_arguments = join " ", @ARGV;
 687
 688 # Process arguments
 689 while (@ARGV) {
 690     my $arg = shift @ARGV;
 691     if ($arg eq '-v') {
 692         $verbosity = $VERBOSE;
 693     }
 694     elsif ($arg eq '-p') {
 695         $verbosity = $PROGRESS;
 696         $| = 1;     # Flush buffers as we go.
 697     }
 698     elsif ($arg eq '-q') {
 699         $verbosity = 0;
 700     }
 701     elsif ($arg eq '-w') {
 702         $write_unchanged_files = 1; # update the files even if havent changed
 703     }
 704     elsif ($arg eq '-check') {
 705         my $this = shift @ARGV;
 706         my $ok = shift @ARGV;
 707         if ($this ne $ok) {
 708             print "Skipping as check params are not the same.\n";
 709             exit(0);
 710         }
 711     }
 712     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 713         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 714     }
 715     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 716     {
 717         $make_test_script = 1;
 718     }
 719     elsif ($arg eq '-makenormtest')
 720     {
 721         $make_norm_test_script = 1;
 722     }
 723     elsif ($arg eq '-makelist') {
 724         $make_list = 1;
 725     }
 726     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 727         -d $use_directory or croak "Unknown directory '$use_directory'";
 728     }
 729     elsif ($arg eq '-L') {
 730
 731         # Existence not tested until have chdir'd
 732         $file_list = shift;
 733     }
 734     elsif ($arg eq '-globlist') {
 735         $glob_list = 1;
 736     }
 737     elsif ($arg eq '-c') {
 738         $output_range_counts = ! $output_range_counts
 739     }
 740     elsif ($arg eq '-annotate') {
 741         $annotate = 1;
 742         $debugging_build = 1;
 743         $output_range_counts = 1;
 744     }
 745     else {
 746         my $with_c = 'with';
 747         $with_c .= 'out' if $output_range_counts;   # Complements the state
 748         croak <<END;
 749 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 750           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 751           [-check A B ]
 752   -c          : Output comments $with_c number of code points in ranges
 753   -q          : Quiet Mode: Only output serious warnings.
 754   -p          : Set verbosity level to normal plus show progress.
 755   -v          : Set Verbosity level high:  Show progress and non-serious
 756                 warnings
 757   -w          : Write files regardless
 758   -C dir      : Change to this directory before proceeding. All relative paths
 759                 except those specified by the -P and -T options will be done
 760                 with respect to this directory.
 761   -P dir      : Output $pod_file file to directory 'dir'.
 762   -T path     : Create a test script as 'path'; overrides -maketest
 763   -L filelist : Use alternate 'filelist' instead of standard one
 764   -globlist   : Take as input all non-Test *.txt files in current and sub
 765                 directories
 766   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 767                 overrides -T
 768   -makelist   : Rewrite the file list $file_list based on current setup
 769   -annotate   : Output an annotation for each character in the table files;
 770                 useful for debugging mktables, looking at diffs; but is slow
 771                 and memory intensive
 772   -check A B  : Executes $0 only if A and B are the same
 773 END
 774     }
 775 }
 776
 777 # Stores the most-recently changed file.  If none have changed, can skip the
 778 # build
 779 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 780
 781 # Change directories now, because need to read 'version' early.
 782 if ($use_directory) {
 783     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 784         $pod_directory = File::Spec->rel2abs($pod_directory);
 785     }
 786     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 787         $t_path = File::Spec->rel2abs($t_path);
 788     }
 789     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 790     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 791         $pod_directory = File::Spec->abs2rel($pod_directory);
 792     }
 793     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 794         $t_path = File::Spec->abs2rel($t_path);
 795     }
 796 }
 797
 798 # Get Unicode version into regular and v-string.  This is done now because
 799 # various tables below get populated based on it.  These tables are populated
 800 # here to be near the top of the file, and so easily seeable by those needing
 801 # to modify things.
 802 open my $VERSION, "<", "version"
 803                     or croak "$0: can't open required file 'version': $!\n";
 804 my $string_version = <$VERSION>;
 805 close $VERSION;
 806 chomp $string_version;
 807 my $v_version = pack "C*", split /\./, $string_version;        # v string
 808
 809 # The following are the complete names of properties with property values that
 810 # are known to not match any code points in some versions of Unicode, but that
 811 # may change in the future so they should be matchable, hence an empty file is
 812 # generated for them.
 813 my @tables_that_may_be_empty;
 814 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
 815                                                     if $v_version lt v6.3.0;
 816 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 817 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 818 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 819                                                     if $v_version ge v4.1.0;
 820 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 821                                                     if $v_version ge v6.0.0;
 822 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 823                                                     if $v_version ge v6.1.0;
 824 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
 825                                                     if $v_version ge v6.2.0;
 826
 827 # The lists below are hashes, so the key is the item in the list, and the
 828 # value is the reason why it is in the list.  This makes generation of
 829 # documentation easier.
 830
 831 my %why_suppressed;  # No file generated for these.
 832
 833 # Files aren't generated for empty extraneous properties.  This is arguable.
 834 # Extraneous properties generally come about because a property is no longer
 835 # used in a newer version of Unicode.  If we generated a file without code
 836 # points, programs that used to work on that property will still execute
 837 # without errors.  It just won't ever match (or will always match, with \P{}).
 838 # This means that the logic is now likely wrong.  I (khw) think its better to
 839 # find this out by getting an error message.  Just move them to the table
 840 # above to change this behavior
 841 my %why_suppress_if_empty_warn_if_not = (
 842
 843    # It is the only property that has ever officially been removed from the
 844    # Standard.  The database never contained any code points for it.
 845    'Special_Case_Condition' => 'Obsolete',
 846
 847    # Apparently never official, but there were code points in some versions of
 848    # old-style PropList.txt
 849    'Non_Break' => 'Obsolete',
 850 );
 851
 852 # These would normally go in the warn table just above, but they were changed
 853 # a long time before this program was written, so warnings about them are
 854 # moot.
 855 if ($v_version gt v3.2.0) {
 856     push @tables_that_may_be_empty,
 857                                 'Canonical_Combining_Class=Attached_Below_Left'
 858 }
 859
 860 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 861 # unless explicitly added.
 862 if ($v_version ge v5.2.0) {
 863     my $unihan = 'Unihan; remove from list if using Unihan';
 864     foreach my $table (qw (
 865                            kAccountingNumeric
 866                            kOtherNumeric
 867                            kPrimaryNumeric
 868                            kCompatibilityVariant
 869                            kIICore
 870                            kIRG_GSource
 871                            kIRG_HSource
 872                            kIRG_JSource
 873                            kIRG_KPSource
 874                            kIRG_MSource
 875                            kIRG_KSource
 876                            kIRG_TSource
 877                            kIRG_USource
 878                            kIRG_VSource
 879                            kRSUnicode
 880                         ))
 881     {
 882         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 883     }
 884 }
 885
 886 # Enum values for to_output_map() method in the Map_Table package.
 887 my $EXTERNAL_MAP = 1;
 888 my $INTERNAL_MAP = 2;
 889 my $OUTPUT_ADJUSTED = 3;
 890
 891 # To override computed values for writing the map tables for these properties.
 892 # The default for enum map tables is to write them out, so that the Unicode
 893 # .txt files can be removed, but all the data to compute any property value
 894 # for any code point is available in a more compact form.
 895 my %global_to_output_map = (
 896     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 897     # get stuck supporting it if things change.  Since it is a STRING
 898     # property, it normally would be listed in the pod, but INTERNAL_MAP
 899     # suppresses that.
 900     Unicode_1_Name => $INTERNAL_MAP,
 901
 902     Present_In => 0,                # Suppress, as easily computed from Age
 903     Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
 904                                             # retained, but needed for
 905                                             # non-ASCII
 906
 907     # Suppress, as mapping can be found instead from the
 908     # Perl_Decomposition_Mapping file
 909     Decomposition_Type => 0,
 910 );
 911
 912 # Properties that this program ignores.
 913 my @unimplemented_properties;
 914
 915 # With this release, it is automatically handled if the Unihan db is
 916 # downloaded
 917 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
 918
 919 # There are several types of obsolete properties defined by Unicode.  These
 920 # must be hand-edited for every new Unicode release.
 921 my %why_deprecated;  # Generates a deprecated warning message if used.
 922 my %why_stabilized;  # Documentation only
 923 my %why_obsolete;    # Documentation only
 924
 925 {   # Closure
 926     my $simple = 'Perl uses the more complete version';
 927     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 928
 929     my $other_properties = 'other properties';
 930     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 931     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 932
 933     %why_deprecated = (
 934         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 935         'Jamo_Short_Name' => $contributory,
 936         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 937         'Other_Alphabetic' => $contributory,
 938         'Other_Default_Ignorable_Code_Point' => $contributory,
 939         'Other_Grapheme_Extend' => $contributory,
 940         'Other_ID_Continue' => $contributory,
 941         'Other_ID_Start' => $contributory,
 942         'Other_Lowercase' => $contributory,
 943         'Other_Math' => $contributory,
 944         'Other_Uppercase' => $contributory,
 945         'Expands_On_NFC' => $why_no_expand,
 946         'Expands_On_NFD' => $why_no_expand,
 947         'Expands_On_NFKC' => $why_no_expand,
 948         'Expands_On_NFKD' => $why_no_expand,
 949     );
 950
 951     %why_suppressed = (
 952         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 953         # contains the same information, but without the algorithmically
 954         # determinable Hangul syllables'.  This file is not published, so it's
 955         # existence is not noted in the comment.
 956         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
 957
 958         'Indic_Matra_Category' => "Provisional",
 959         'Indic_Syllabic_Category' => "Provisional",
 960
 961         # Don't suppress ISO_Comment, as otherwise special handling is needed
 962         # to differentiate between it and gc=c, which can be written as 'isc',
 963         # which is the same characters as ISO_Comment's short name.
 964
 965         'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
 966
 967         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
 968         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 969         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 970         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 971
 972         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 973     );
 974
 975     foreach my $property (
 976
 977             # The following are suppressed because they were made contributory
 978             # or deprecated by Unicode before Perl ever thought about
 979             # supporting them.
 980             'Jamo_Short_Name',
 981             'Grapheme_Link',
 982             'Expands_On_NFC',
 983             'Expands_On_NFD',
 984             'Expands_On_NFKC',
 985             'Expands_On_NFKD',
 986
 987             # The following are suppressed because they have been marked
 988             # as deprecated for a sufficient amount of time
 989             'Other_Alphabetic',
 990             'Other_Default_Ignorable_Code_Point',
 991             'Other_Grapheme_Extend',
 992             'Other_ID_Continue',
 993             'Other_ID_Start',
 994             'Other_Lowercase',
 995             'Other_Math',
 996             'Other_Uppercase',
 997     ) {
 998         $why_suppressed{$property} = $why_deprecated{$property};
 999     }
1000
1001     # Customize the message for all the 'Other_' properties
1002     foreach my $property (keys %why_deprecated) {
1003         next if (my $main_property = $property) !~ s/^Other_//;
1004         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1005     }
1006 }
1007
1008 if ($write_Unicode_deprecated_tables) {
1009     foreach my $property (keys %why_suppressed) {
1010         delete $why_suppressed{$property} if $property =~
1011                                                     / ^ Other | Grapheme /x;
1012     }
1013 }
1014
1015 if ($v_version ge 4.0.0) {
1016     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1017     if ($v_version ge 6.0.0) {
1018         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1019     }
1020 }
1021 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1022     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1023     if ($v_version ge 6.0.0) {
1024         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1025     }
1026 }
1027
1028 # Probably obsolete forever
1029 if ($v_version ge v4.1.0) {
1030     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1031 }
1032 if ($v_version ge v6.0.0) {
1033     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1034     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1035 }
1036
1037 # This program can create files for enumerated-like properties, such as
1038 # 'Numeric_Type'.  This file would be the same format as for a string
1039 # property, with a mapping from code point to its value, so you could look up,
1040 # for example, the script a code point is in.  But no one so far wants this
1041 # mapping, or they have found another way to get it since this is a new
1042 # feature.  So no file is generated except if it is in this list.
1043 my @output_mapped_properties = split "\n", <<END;
1044 END
1045
1046 # If you are using the Unihan database in a Unicode version before 5.2, you
1047 # need to add the properties that you want to extract from it to this table.
1048 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1049 # listed, commented out
1050 my @cjk_properties = split "\n", <<'END';
1051 #cjkAccountingNumeric; kAccountingNumeric
1052 #cjkOtherNumeric; kOtherNumeric
1053 #cjkPrimaryNumeric; kPrimaryNumeric
1054 #cjkCompatibilityVariant; kCompatibilityVariant
1055 #cjkIICore ; kIICore
1056 #cjkIRG_GSource; kIRG_GSource
1057 #cjkIRG_HSource; kIRG_HSource
1058 #cjkIRG_JSource; kIRG_JSource
1059 #cjkIRG_KPSource; kIRG_KPSource
1060 #cjkIRG_KSource; kIRG_KSource
1061 #cjkIRG_TSource; kIRG_TSource
1062 #cjkIRG_USource; kIRG_USource
1063 #cjkIRG_VSource; kIRG_VSource
1064 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
1065 END
1066
1067 # Similarly for the property values.  For your convenience, the lines in the
1068 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
1069 # '#' marks (for Unicode versions before 5.2)
1070 my @cjk_property_values = split "\n", <<'END';
1071 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1072 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1073 ## @missing: 0000..10FFFF; cjkIICore; <none>
1074 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1075 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1076 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1077 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1078 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1079 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1080 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1081 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1082 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1083 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1084 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1085 END
1086
1087 # The input files don't list every code point.  Those not listed are to be
1088 # defaulted to some value.  Below are hard-coded what those values are for
1089 # non-binary properties as of 5.1.  Starting in 5.0, there are
1090 # machine-parsable comment lines in the files that give the defaults; so this
1091 # list shouldn't have to be extended.  The claim is that all missing entries
1092 # for binary properties will default to 'N'.  Unicode tried to change that in
1093 # 5.2, but the beta period produced enough protest that they backed off.
1094 #
1095 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1096 # be in the form that it expects.  The others may be synonyms.
1097 my $CODE_POINT = '<code point>';
1098 my %default_mapping = (
1099     Age => "Unassigned",
1100     # Bidi_Class => Complicated; set in code
1101     Bidi_Mirroring_Glyph => "",
1102     Block => 'No_Block',
1103     Canonical_Combining_Class => 0,
1104     Case_Folding => $CODE_POINT,
1105     Decomposition_Mapping => $CODE_POINT,
1106     Decomposition_Type => 'None',
1107     East_Asian_Width => "Neutral",
1108     FC_NFKC_Closure => $CODE_POINT,
1109     General_Category => 'Cn',
1110     Grapheme_Cluster_Break => 'Other',
1111     Hangul_Syllable_Type => 'NA',
1112     ISO_Comment => "",
1113     Jamo_Short_Name => "",
1114     Joining_Group => "No_Joining_Group",
1115     # Joining_Type => Complicated; set in code
1116     kIICore => 'N',   #                       Is converted to binary
1117     #Line_Break => Complicated; set in code
1118     Lowercase_Mapping => $CODE_POINT,
1119     Name => "",
1120     Name_Alias => "",
1121     NFC_QC => 'Yes',
1122     NFD_QC => 'Yes',
1123     NFKC_QC => 'Yes',
1124     NFKD_QC => 'Yes',
1125     Numeric_Type => 'None',
1126     Numeric_Value => 'NaN',
1127     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1128     Sentence_Break => 'Other',
1129     Simple_Case_Folding => $CODE_POINT,
1130     Simple_Lowercase_Mapping => $CODE_POINT,
1131     Simple_Titlecase_Mapping => $CODE_POINT,
1132     Simple_Uppercase_Mapping => $CODE_POINT,
1133     Titlecase_Mapping => $CODE_POINT,
1134     Unicode_1_Name => "",
1135     Unicode_Radical_Stroke => "",
1136     Uppercase_Mapping => $CODE_POINT,
1137     Word_Break => 'Other',
1138 );
1139
1140 # Below are files that Unicode furnishes, but this program ignores, and why.
1141 # NormalizationCorrections.txt requires some more explanation.  It documents
1142 # the cumulative fixes to erroneous normalizations in earlier Unicode
1143 # versions.  Its main purpose is so that someone running on an earlier version
1144 # can use this file to override what got published in that earlier release.
1145 # It would be easy for mktables to read and handle this file.  But all the
1146 # corrections in it should already be in the other files for the release it
1147 # is.  To get it to actually mean something useful, someone would have to be
1148 # using an earlier Unicode release, and copy it to the files for that release
1149 # and recomplile.  So far there has been no demand to do that, so this hasn't
1150 # been implemented.
1151 my %ignored_files = (
1152     'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1153     'Index.txt' => 'Alphabetical index of Unicode characters',
1154     'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1155     'NamesList.txt' => 'Annotated list of characters',
1156     'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>',
1157     'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1158     'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1159     'ReadMe.txt' => 'Documentation',
1160     'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized.  This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1161     'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.',
1162     'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1163     'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1164     'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>',
1165     'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1166     'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1167     'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1168     'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1169 );
1170
1171 my %skipped_files;  # List of files that we skip
1172
1173 ### End of externally interesting definitions, except for @input_file_objects
1174
1175 my $HEADER=<<"EOF";
1176 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1177 # This file is machine-generated by $0 from the Unicode
1178 # database, Version $string_version.  Any changes made here will be lost!
1179 EOF
1180
1181 my $INTERNAL_ONLY_HEADER = <<"EOF";
1182
1183 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1184 # This file is for internal use by core Perl only.  The format and even the
1185 # name or existence of this file are subject to change without notice.  Don't
1186 # use it directly.  Use Unicode::UCD to access the Unicode character data
1187 # base.
1188 EOF
1189
1190 my $DEVELOPMENT_ONLY=<<"EOF";
1191 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1192 # This file contains information artificially constrained to code points
1193 # present in Unicode release $string_compare_versions.
1194 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1195 # not be used for production.
1196
1197 EOF
1198
1199 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1200 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1201 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1202
1203 # We work with above-Unicode code points, up to UV_MAX.   But when you get
1204 # that high, above IV_MAX, some operations don't work, and you can easily get
1205 # overflow.  Therefore for internal use, we use a much smaller number,
1206 # translating it to UV_MAX only for output.  The exact number is immaterial
1207 # (all Unicode code points are treated exactly the same), but the algorithm
1208 # requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
1209 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1210 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1211 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1212
1213 my $MAX_PLATFORM_CODEPOINT = ~0;
1214
1215 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1216 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1217 # to decrease backtracking.  The first regex allows the code point to be at
1218 # the end of a word, but to work properly, the word shouldn't end with a valid
1219 # hex character.  The second one won't match a code point at the end of a
1220 # word, and doesn't have the run-on issue
1221 my $run_on_code_point_re =
1222             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1223 my $code_point_re = qr/\b$run_on_code_point_re/;
1224
1225 # This matches the beginning of the line in the Unicode db files that give the
1226 # defaults for code points not listed (i.e., missing) in the file.  The code
1227 # depends on this ending with a semi-colon, so it can assume it is a valid
1228 # field when the line is split() by semi-colons
1229 my $missing_defaults_prefix =
1230             qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1231
1232 # Property types.  Unicode has more types, but these are sufficient for our
1233 # purposes.
1234 my $UNKNOWN = -1;   # initialized to illegal value
1235 my $NON_STRING = 1; # Either binary or enum
1236 my $BINARY = 2;
1237 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1238                        # tables, additional true and false tables are
1239                        # generated so that false is anything matching the
1240                        # default value, and true is everything else.
1241 my $ENUM = 4;       # Include catalog
1242 my $STRING = 5;     # Anything else: string or misc
1243
1244 # Some input files have lines that give default values for code points not
1245 # contained in the file.  Sometimes these should be ignored.
1246 my $NO_DEFAULTS = 0;        # Must evaluate to false
1247 my $NOT_IGNORED = 1;
1248 my $IGNORED = 2;
1249
1250 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1251 # and will appear in the main body of the tables in the output files, but
1252 # there are other types of ranges as well, listed below, that are specially
1253 # handled.   There are pseudo-types as well that will never be stored as a
1254 # type, but will affect the calculation of the type.
1255
1256 # 0 is for normal, non-specials
1257 my $MULTI_CP = 1;           # Sequence of more than code point
1258 my $HANGUL_SYLLABLE = 2;
1259 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1260 my $NULL = 4;               # The map is to the null string; utf8.c can't
1261                             # handle these, nor is there an accepted syntax
1262                             # for them in \p{} constructs
1263 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1264                              # otherwise be $MULTI_CP type are instead type 0
1265
1266 # process_generic_property_file() can accept certain overrides in its input.
1267 # Each of these must begin AND end with $CMD_DELIM.
1268 my $CMD_DELIM = "\a";
1269 my $REPLACE_CMD = 'replace';    # Override the Replace
1270 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1271
1272 my $NO = 0;
1273 my $YES = 1;
1274
1275 # Values for the Replace argument to add_range.
1276 # $NO                      # Don't replace; add only the code points not
1277                            # already present.
1278 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1279                            # the comments at the subroutine definition.
1280 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1281 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1282                            # already there
1283 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1284                            # already there
1285 my $CROAK = 6;             # Die with an error if is already there
1286
1287 # Flags to give property statuses.  The phrases are to remind maintainers that
1288 # if the flag is changed, the indefinite article referring to it in the
1289 # documentation may need to be as well.
1290 my $NORMAL = "";
1291 my $DEPRECATED = 'D';
1292 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1293 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1294 my $DISCOURAGED = 'X';
1295 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1296 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1297 my $STRICTER = 'T';
1298 my $a_bold_stricter = "a 'B<$STRICTER>'";
1299 my $A_bold_stricter = "A 'B<$STRICTER>'";
1300 my $STABILIZED = 'S';
1301 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1302 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1303 my $OBSOLETE = 'O';
1304 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1305 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1306
1307 my %status_past_participles = (
1308     $DISCOURAGED => 'discouraged',
1309     $STABILIZED => 'stabilized',
1310     $OBSOLETE => 'obsolete',
1311     $DEPRECATED => 'deprecated',
1312 );
1313
1314 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1315 # externally documented.
1316 my $ORDINARY = 0;       # The normal fate.
1317 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1318                         # but there is a file written that can be used to
1319                         # reconstruct this table
1320 my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1321                         # for Perl's internal use only
1322 my $LEGACY_ONLY = 3;    # Like $INTERNAL_ONLY, but not actually used by Perl.
1323                         # Is for backwards compatibility for applications that
1324                         # read the file directly, so it's format is
1325                         # unchangeable.
1326 my $SUPPRESSED = 4;     # The file for this table is not written out, and as a
1327                         # result, we don't bother to do many computations on
1328                         # it.
1329 my $PLACEHOLDER = 5;    # Like $SUPPRESSED, but we go through all the
1330                         # computations anyway, as the values are needed for
1331                         # things to work.  This happens when we have Perl
1332                         # extensions that depend on Unicode tables that
1333                         # wouldn't normally be in a given Unicode version.
1334
1335 # The format of the values of the tables:
1336 my $EMPTY_FORMAT = "";
1337 my $BINARY_FORMAT = 'b';
1338 my $DECIMAL_FORMAT = 'd';
1339 my $FLOAT_FORMAT = 'f';
1340 my $INTEGER_FORMAT = 'i';
1341 my $HEX_FORMAT = 'x';
1342 my $RATIONAL_FORMAT = 'r';
1343 my $STRING_FORMAT = 's';
1344 my $ADJUST_FORMAT = 'a';
1345 my $HEX_ADJUST_FORMAT = 'ax';
1346 my $DECOMP_STRING_FORMAT = 'c';
1347 my $STRING_WHITE_SPACE_LIST = 'sw';
1348
1349 my %map_table_formats = (
1350     $BINARY_FORMAT => 'binary',
1351     $DECIMAL_FORMAT => 'single decimal digit',
1352     $FLOAT_FORMAT => 'floating point number',
1353     $INTEGER_FORMAT => 'integer',
1354     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1355     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1356     $STRING_FORMAT => 'string',
1357     $ADJUST_FORMAT => 'some entries need adjustment',
1358     $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1359     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1360     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1361 );
1362
1363 # Unicode didn't put such derived files in a separate directory at first.
1364 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1365 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1366 my $AUXILIARY = 'auxiliary';
1367
1368 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1369 # and into UCD.pl for the use of UCD.pm
1370 my %loose_to_file_of;       # loosely maps table names to their respective
1371                             # files
1372 my %stricter_to_file_of;    # same; but for stricter mapping.
1373 my %loose_property_to_file_of; # Maps a loose property name to its map file
1374 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1375                             # in the hash %utf8::SwashInfo
1376 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1377                              # their rational equivalent
1378 my %loose_property_name_of; # Loosely maps (non_string) property names to
1379                             # standard form
1380 my %string_property_loose_to_name; # Same, for string properties.
1381 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1382                             # the property name in standard loose form, and
1383                             # 'value' is the default value for that property,
1384                             # also in standard loose form.
1385 my %loose_to_standard_value; # loosely maps table names to the canonical
1386                             # alias for them
1387 my %ambiguous_names;        # keys are alias names (in standard form) that
1388                             # have more than one possible meaning.
1389 my %prop_aliases;           # Keys are standard property name; values are each
1390                             # one's aliases
1391 my %prop_value_aliases;     # Keys of top level are standard property name;
1392                             # values are keys to another hash,  Each one is
1393                             # one of the property's values, in standard form.
1394                             # The values are that prop-val's aliases.
1395 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1396
1397 # Most properties are immune to caseless matching, otherwise you would get
1398 # nonsensical results, as properties are a function of a code point, not
1399 # everything that is caselessly equivalent to that code point.  For example,
1400 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1401 # be true because 's' and 'S' are equivalent caselessly.  However,
1402 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1403 # extend that concept to those very few properties that are like this.  Each
1404 # such property will match the full range caselessly.  They are hard-coded in
1405 # the program; it's not worth trying to make it general as it's extremely
1406 # unlikely that they will ever change.
1407 my %caseless_equivalent_to;
1408
1409 # These constants names and values were taken from the Unicode standard,
1410 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1411 # syllables.  The '_string' versions are so generated tables can retain the
1412 # hex format, which is the more familiar value
1413 my $SBase_string = "0xAC00";
1414 my $SBase = CORE::hex $SBase_string;
1415 my $LBase_string = "0x1100";
1416 my $LBase = CORE::hex $LBase_string;
1417 my $VBase_string = "0x1161";
1418 my $VBase = CORE::hex $VBase_string;
1419 my $TBase_string = "0x11A7";
1420 my $TBase = CORE::hex $TBase_string;
1421 my $SCount = 11172;
1422 my $LCount = 19;
1423 my $VCount = 21;
1424 my $TCount = 28;
1425 my $NCount = $VCount * $TCount;
1426
1427 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1428 # with the above published constants.
1429 my %Jamo;
1430 my %Jamo_L;     # Leading consonants
1431 my %Jamo_V;     # Vowels
1432 my %Jamo_T;     # Trailing consonants
1433
1434 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1435 # The key is the base name of the code point, and the value is an
1436 # array giving all the ranges that use this base name.  Each range
1437 # is actually a hash giving the 'low' and 'high' values of it.
1438 my %names_ending_in_code_point;
1439 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1440                                         # removed from the names
1441 # Inverse mapping.  The list of ranges that have these kinds of
1442 # names.  Each element contains the low, high, and base names in an
1443 # anonymous hash.
1444 my @code_points_ending_in_code_point;
1445
1446 # To hold Unicode's normalization test suite
1447 my @normalization_tests;
1448
1449 # Boolean: does this Unicode version have the hangul syllables, and are we
1450 # writing out a table for them?
1451 my $has_hangul_syllables = 0;
1452
1453 # Does this Unicode version have code points whose names end in their
1454 # respective code points, and are we writing out a table for them?  0 for no;
1455 # otherwise points to first property that a table is needed for them, so that
1456 # if multiple tables are needed, we don't create duplicates
1457 my $needing_code_points_ending_in_code_point = 0;
1458
1459 my @backslash_X_tests;     # List of tests read in for testing \X
1460 my @unhandled_properties;  # Will contain a list of properties found in
1461                            # the input that we didn't process.
1462 my @match_properties;      # Properties that have match tables, to be
1463                            # listed in the pod
1464 my @map_properties;        # Properties that get map files written
1465 my @named_sequences;       # NamedSequences.txt contents.
1466 my %potential_files;       # Generated list of all .txt files in the directory
1467                            # structure so we can warn if something is being
1468                            # ignored.
1469 my @files_actually_output; # List of files we generated.
1470 my @more_Names;            # Some code point names are compound; this is used
1471                            # to store the extra components of them.
1472 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1473                            # the minimum before we consider it equivalent to a
1474                            # candidate rational
1475 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1476
1477 # These store references to certain commonly used property objects
1478 my $ccc;
1479 my $gc;
1480 my $perl;
1481 my $block;
1482 my $perl_charname;
1483 my $print;
1484 my $All;
1485 my $script;
1486
1487 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1488 my $has_In_conflicts = 0;
1489 my $has_Is_conflicts = 0;
1490
1491 sub internal_file_to_platform ($) {
1492     # Convert our file paths which have '/' separators to those of the
1493     # platform.
1494
1495     my $file = shift;
1496     return undef unless defined $file;
1497
1498     return File::Spec->join(split '/', $file);
1499 }
1500
1501 sub file_exists ($) {   # platform independent '-e'.  This program internally
1502                         # uses slash as a path separator.
1503     my $file = shift;
1504     return 0 if ! defined $file;
1505     return -e internal_file_to_platform($file);
1506 }
1507
1508 sub objaddr($) {
1509     # Returns the address of the blessed input object.
1510     # It doesn't check for blessedness because that would do a string eval
1511     # every call, and the program is structured so that this is never called
1512     # for a non-blessed object.
1513
1514     no overloading; # If overloaded, numifying below won't work.
1515
1516     # Numifying a ref gives its address.
1517     return pack 'J', $_[0];
1518 }
1519
1520 # These are used only if $annotate is true.
1521 # The entire range of Unicode characters is examined to populate these
1522 # after all the input has been processed.  But most can be skipped, as they
1523 # have the same descriptive phrases, such as being unassigned
1524 my @viacode;            # Contains the 1 million character names
1525 my @printable;          # boolean: And are those characters printable?
1526 my @annotate_char_type; # Contains a type of those characters, specifically
1527                         # for the purposes of annotation.
1528 my $annotate_ranges;    # A map of ranges of code points that have the same
1529                         # name for the purposes of annotation.  They map to the
1530                         # upper edge of the range, so that the end point can
1531                         # be immediately found.  This is used to skip ahead to
1532                         # the end of a range, and avoid processing each
1533                         # individual code point in it.
1534 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1535                                    # characters, but excluding those which are
1536                                    # also noncharacter code points
1537
1538 # The annotation types are an extension of the regular range types, though
1539 # some of the latter are folded into one.  Make the new types negative to
1540 # avoid conflicting with the regular types
1541 my $SURROGATE_TYPE = -1;
1542 my $UNASSIGNED_TYPE = -2;
1543 my $PRIVATE_USE_TYPE = -3;
1544 my $NONCHARACTER_TYPE = -4;
1545 my $CONTROL_TYPE = -5;
1546 my $ABOVE_UNICODE_TYPE = -6;
1547 my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1548
1549 sub populate_char_info ($) {
1550     # Used only with the $annotate option.  Populates the arrays with the
1551     # input code point's info that are needed for outputting more detailed
1552     # comments.  If calling context wants a return, it is the end point of
1553     # any contiguous range of characters that share essentially the same info
1554
1555     my $i = shift;
1556     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1557
1558     $viacode[$i] = $perl_charname->value_of($i) || "";
1559
1560     # A character is generally printable if Unicode says it is,
1561     # but below we make sure that most Unicode general category 'C' types
1562     # aren't.
1563     $printable[$i] = $print->contains($i);
1564
1565     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1566
1567     # Only these two regular types are treated specially for annotations
1568     # purposes
1569     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1570                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1571
1572     # Give a generic name to all code points that don't have a real name.
1573     # We output ranges, if applicable, for these.  Also calculate the end
1574     # point of the range.
1575     my $end;
1576     if (! $viacode[$i]) {
1577         my $nonchar;
1578         if ($i > $MAX_UNICODE_CODEPOINT) {
1579             $viacode[$i] = 'Above-Unicode';
1580             $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1581             $printable[$i] = 0;
1582             $end = $MAX_WORKING_CODEPOINT;
1583         }
1584         elsif ($gc-> table('Private_use')->contains($i)) {
1585             $viacode[$i] = 'Private Use';
1586             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1587             $printable[$i] = 0;
1588             $end = $gc->table('Private_Use')->containing_range($i)->end;
1589         }
1590         elsif ((defined ($nonchar =
1591                             Property::property_ref('Noncharacter_Code_Point'))
1592                && $nonchar->table('Y')->contains($i)))
1593         {
1594             $viacode[$i] = 'Noncharacter';
1595             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1596             $printable[$i] = 0;
1597             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1598                                                     containing_range($i)->end;
1599         }
1600         elsif ($gc-> table('Control')->contains($i)) {
1601             $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1602             $annotate_char_type[$i] = $CONTROL_TYPE;
1603             $printable[$i] = 0;
1604         }
1605         elsif ($gc-> table('Unassigned')->contains($i)) {
1606             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1607             $printable[$i] = 0;
1608             if ($v_version lt v2.0.0) { # No blocks in earliest releases
1609                 $viacode[$i] = 'Unassigned';
1610                 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1611             }
1612             else {
1613                 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1614
1615                 # Because we name the unassigned by the blocks they are in, it
1616                 # can't go past the end of that block, and it also can't go
1617                 # past the unassigned range it is in.  The special table makes
1618                 # sure that the non-characters, which are unassigned, are
1619                 # separated out.
1620                 $end = min($block->containing_range($i)->end,
1621                            $unassigned_sans_noncharacters->
1622                                                     containing_range($i)->end);
1623             }
1624         }
1625         elsif ($v_version lt v2.0.0) {  # No surrogates in earliest releases
1626             $viacode[$i] = $gc->value_of($i);
1627             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1628             $printable[$i] = 0;
1629         }
1630         elsif ($gc-> table('Surrogate')->contains($i)) {
1631             $viacode[$i] = 'Surrogate';
1632             $annotate_char_type[$i] = $SURROGATE_TYPE;
1633             $printable[$i] = 0;
1634             $end = $gc->table('Surrogate')->containing_range($i)->end;
1635         }
1636         else {
1637             Carp::my_carp_bug("Can't figure out how to annotate "
1638                               . sprintf("U+%04X", $i)
1639                               . ".  Proceeding anyway.");
1640             $viacode[$i] = 'UNKNOWN';
1641             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1642             $printable[$i] = 0;
1643         }
1644     }
1645
1646     # Here, has a name, but if it's one in which the code point number is
1647     # appended to the name, do that.
1648     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1649         $viacode[$i] .= sprintf("-%04X", $i);
1650         $end = $perl_charname->containing_range($i)->end;
1651     }
1652
1653     # And here, has a name, but if it's a hangul syllable one, replace it with
1654     # the correct name from the Unicode algorithm
1655     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1656         use integer;
1657         my $SIndex = $i - $SBase;
1658         my $L = $LBase + $SIndex / $NCount;
1659         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1660         my $T = $TBase + $SIndex % $TCount;
1661         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1662         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1663         $end = $perl_charname->containing_range($i)->end;
1664     }
1665
1666     return if ! defined wantarray;
1667     return $i if ! defined $end;    # If not a range, return the input
1668
1669     # Save this whole range so can find the end point quickly
1670     $annotate_ranges->add_map($i, $end, $end);
1671
1672     return $end;
1673 }
1674
1675 # Commented code below should work on Perl 5.8.
1676 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1677 ## the native perl version of it (which is what would operate under miniperl)
1678 ## is extremely slow, as it does a string eval every call.
1679 #my $has_fast_scalar_util = $^X !~ /miniperl/
1680 #                            && defined eval "require Scalar::Util";
1681 #
1682 #sub objaddr($) {
1683 #    # Returns the address of the blessed input object.  Uses the XS version if
1684 #    # available.  It doesn't check for blessedness because that would do a
1685 #    # string eval every call, and the program is structured so that this is
1686 #    # never called for a non-blessed object.
1687 #
1688 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1689 #
1690 #    # Check at least that is a ref.
1691 #    my $pkg = ref($_[0]) or return undef;
1692 #
1693 #    # Change to a fake package to defeat any overloaded stringify
1694 #    bless $_[0], 'main::Fake';
1695 #
1696 #    # Numifying a ref gives its address.
1697 #    my $addr = pack 'J', $_[0];
1698 #
1699 #    # Return to original class
1700 #    bless $_[0], $pkg;
1701 #    return $addr;
1702 #}
1703
1704 sub max ($$) {
1705     my $a = shift;
1706     my $b = shift;
1707     return $a if $a >= $b;
1708     return $b;
1709 }
1710
1711 sub min ($$) {
1712     my $a = shift;
1713     my $b = shift;
1714     return $a if $a <= $b;
1715     return $b;
1716 }
1717
1718 sub clarify_number ($) {
1719     # This returns the input number with underscores inserted every 3 digits
1720     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1721     # checked.
1722
1723     my $number = shift;
1724     my $pos = length($number) - 3;
1725     return $number if $pos <= 1;
1726     while ($pos > 0) {
1727         substr($number, $pos, 0) = '_';
1728         $pos -= 3;
1729     }
1730     return $number;
1731 }
1732
1733 sub clarify_code_point_count ($) {
1734     # This is like clarify_number(), but the input is assumed to be a count of
1735     # code points, rather than a generic number.
1736
1737     my $append = "";
1738
1739     my $number = shift;
1740     if ($number > $MAX_UNICODE_CODEPOINTS) {
1741         $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1742         return "All above-Unicode code points" if $number == 0;
1743         $append = " + all above-Unicode code points";
1744     }
1745     return clarify_number($number) . $append;
1746 }
1747
1748 package Carp;
1749
1750 # These routines give a uniform treatment of messages in this program.  They
1751 # are placed in the Carp package to cause the stack trace to not include them,
1752 # although an alternative would be to use another package and set @CARP_NOT
1753 # for it.
1754
1755 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1756
1757 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1758 # and overload trying to load Scalar:Util under miniperl.  See
1759 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1760 undef $overload::VERSION;
1761
1762 sub my_carp {
1763     my $message = shift || "";
1764     my $nofold = shift || 0;
1765
1766     if ($message) {
1767         $message = main::join_lines($message);
1768         $message =~ s/^$0: *//;     # Remove initial program name
1769         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1770         $message = "\n$0: $message;";
1771
1772         # Fold the message with program name, semi-colon end punctuation
1773         # (which looks good with the message that carp appends to it), and a
1774         # hanging indent for continuation lines.
1775         $message = main::simple_fold($message, "", 4) unless $nofold;
1776         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1777                                     # appends is to the same line
1778     }
1779
1780     return $message if defined wantarray;   # If a caller just wants the msg
1781
1782     carp $message;
1783     return;
1784 }
1785
1786 sub my_carp_bug {
1787     # This is called when it is clear that the problem is caused by a bug in
1788     # this program.
1789
1790     my $message = shift;
1791     $message =~ s/^$0: *//;
1792     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1793     carp $message;
1794     return;
1795 }
1796
1797 sub carp_too_few_args {
1798     if (@_ != 2) {
1799         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1800         return;
1801     }
1802
1803     my $args_ref = shift;
1804     my $count = shift;
1805
1806     my_carp_bug("Need at least $count arguments to "
1807         . (caller 1)[3]
1808         . ".  Instead got: '"
1809         . join ', ', @$args_ref
1810         . "'.  No action taken.");
1811     return;
1812 }
1813
1814 sub carp_extra_args {
1815     my $args_ref = shift;
1816     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1817
1818     unless (ref $args_ref) {
1819         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1820         return;
1821     }
1822     my ($package, $file, $line) = caller;
1823     my $subroutine = (caller 1)[3];
1824
1825     my $list;
1826     if (ref $args_ref eq 'HASH') {
1827         foreach my $key (keys %$args_ref) {
1828             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1829         }
1830         $list = join ', ', each %{$args_ref};
1831     }
1832     elsif (ref $args_ref eq 'ARRAY') {
1833         foreach my $arg (@$args_ref) {
1834             $arg = $UNDEF unless defined $arg;
1835         }
1836         $list = join ', ', @$args_ref;
1837     }
1838     else {
1839         my_carp_bug("Can't cope with ref "
1840                 . ref($args_ref)
1841                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1842         return;
1843     }
1844
1845     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1846     return;
1847 }
1848
1849 package main;
1850
1851 { # Closure
1852
1853     # This program uses the inside-out method for objects, as recommended in
1854     # "Perl Best Practices".  This closure aids in generating those.  There
1855     # are two routines.  setup_package() is called once per package to set
1856     # things up, and then set_access() is called for each hash representing a
1857     # field in the object.  These routines arrange for the object to be
1858     # properly destroyed when no longer used, and for standard accessor
1859     # functions to be generated.  If you need more complex accessors, just
1860     # write your own and leave those accesses out of the call to set_access().
1861     # More details below.
1862
1863     my %constructor_fields; # fields that are to be used in constructors; see
1864                             # below
1865
1866     # The values of this hash will be the package names as keys to other
1867     # hashes containing the name of each field in the package as keys, and
1868     # references to their respective hashes as values.
1869     my %package_fields;
1870
1871     sub setup_package {
1872         # Sets up the package, creating standard DESTROY and dump methods
1873         # (unless already defined).  The dump method is used in debugging by
1874         # simple_dumper().
1875         # The optional parameters are:
1876         #   a)  a reference to a hash, that gets populated by later
1877         #       set_access() calls with one of the accesses being
1878         #       'constructor'.  The caller can then refer to this, but it is
1879         #       not otherwise used by these two routines.
1880         #   b)  a reference to a callback routine to call during destruction
1881         #       of the object, before any fields are actually destroyed
1882
1883         my %args = @_;
1884         my $constructor_ref = delete $args{'Constructor_Fields'};
1885         my $destroy_callback = delete $args{'Destroy_Callback'};
1886         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1887
1888         my %fields;
1889         my $package = (caller)[0];
1890
1891         $package_fields{$package} = \%fields;
1892         $constructor_fields{$package} = $constructor_ref;
1893
1894         unless ($package->can('DESTROY')) {
1895             my $destroy_name = "${package}::DESTROY";
1896             no strict "refs";
1897
1898             # Use typeglob to give the anonymous subroutine the name we want
1899             *$destroy_name = sub {
1900                 my $self = shift;
1901                 my $addr = do { no overloading; pack 'J', $self; };
1902
1903                 $self->$destroy_callback if $destroy_callback;
1904                 foreach my $field (keys %{$package_fields{$package}}) {
1905                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1906                     delete $package_fields{$package}{$field}{$addr};
1907                 }
1908                 return;
1909             }
1910         }
1911
1912         unless ($package->can('dump')) {
1913             my $dump_name = "${package}::dump";
1914             no strict "refs";
1915             *$dump_name = sub {
1916                 my $self = shift;
1917                 return dump_inside_out($self, $package_fields{$package}, @_);
1918             }
1919         }
1920         return;
1921     }
1922
1923     sub set_access {
1924         # Arrange for the input field to be garbage collected when no longer
1925         # needed.  Also, creates standard accessor functions for the field
1926         # based on the optional parameters-- none if none of these parameters:
1927         #   'addable'    creates an 'add_NAME()' accessor function.
1928         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1929         #                function.
1930         #   'settable'   creates a 'set_NAME()' accessor function.
1931         #   'constructor' doesn't create an accessor function, but adds the
1932         #                field to the hash that was previously passed to
1933         #                setup_package();
1934         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1935         # 'add' etc. all mean 'addable'.
1936         # The read accessor function will work on both array and scalar
1937         # values.  If another accessor in the parameter list is 'a', the read
1938         # access assumes an array.  You can also force it to be array access
1939         # by specifying 'readable_array' instead of 'readable'
1940         #
1941         # A sort-of 'protected' access can be set-up by preceding the addable,
1942         # readable or settable with some initial portion of 'protected_' (but,
1943         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1944         # "protection" is only by convention.  All that happens is that the
1945         # accessor functions' names begin with an underscore.  So instead of
1946         # calling set_foo, the call is _set_foo.  (Real protection could be
1947         # accomplished by having a new subroutine, end_package, called at the
1948         # end of each package, and then storing the __LINE__ ranges and
1949         # checking them on every accessor.  But that is way overkill.)
1950
1951         # We create anonymous subroutines as the accessors and then use
1952         # typeglobs to assign them to the proper package and name
1953
1954         my $name = shift;   # Name of the field
1955         my $field = shift;  # Reference to the inside-out hash containing the
1956                             # field
1957
1958         my $package = (caller)[0];
1959
1960         if (! exists $package_fields{$package}) {
1961             croak "$0: Must call 'setup_package' before 'set_access'";
1962         }
1963
1964         # Stash the field so DESTROY can get it.
1965         $package_fields{$package}{$name} = $field;
1966
1967         # Remaining arguments are the accessors.  For each...
1968         foreach my $access (@_) {
1969             my $access = lc $access;
1970
1971             my $protected = "";
1972
1973             # Match the input as far as it goes.
1974             if ($access =~ /^(p[^_]*)_/) {
1975                 $protected = $1;
1976                 if (substr('protected_', 0, length $protected)
1977                     eq $protected)
1978                 {
1979
1980                     # Add 1 for the underscore not included in $protected
1981                     $access = substr($access, length($protected) + 1);
1982                     $protected = '_';
1983                 }
1984                 else {
1985                     $protected = "";
1986                 }
1987             }
1988
1989             if (substr('addable', 0, length $access) eq $access) {
1990                 my $subname = "${package}::${protected}add_$name";
1991                 no strict "refs";
1992
1993                 # add_ accessor.  Don't add if already there, which we
1994                 # determine using 'eq' for scalars and '==' otherwise.
1995                 *$subname = sub {
1996                     use strict "refs";
1997                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1998                     my $self = shift;
1999                     my $value = shift;
2000                     my $addr = do { no overloading; pack 'J', $self; };
2001                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2002                     if (ref $value) {
2003                         return if grep { $value == $_ } @{$field->{$addr}};
2004                     }
2005                     else {
2006                         return if grep { $value eq $_ } @{$field->{$addr}};
2007                     }
2008                     push @{$field->{$addr}}, $value;
2009                     return;
2010                 }
2011             }
2012             elsif (substr('constructor', 0, length $access) eq $access) {
2013                 if ($protected) {
2014                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
2015                 }
2016                 else {
2017                     $constructor_fields{$package}{$name} = $field;
2018                 }
2019             }
2020             elsif (substr('readable_array', 0, length $access) eq $access) {
2021
2022                 # Here has read access.  If one of the other parameters for
2023                 # access is array, or this one specifies array (by being more
2024                 # than just 'readable_'), then create a subroutine that
2025                 # assumes the data is an array.  Otherwise just a scalar
2026                 my $subname = "${package}::${protected}$name";
2027                 if (grep { /^a/i } @_
2028                     or length($access) > length('readable_'))
2029                 {
2030                     no strict "refs";
2031                     *$subname = sub {
2032                         use strict "refs";
2033                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2034                         my $addr = do { no overloading; pack 'J', $_[0]; };
2035                         if (ref $field->{$addr} ne 'ARRAY') {
2036                             my $type = ref $field->{$addr};
2037                             $type = 'scalar' unless $type;
2038                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
2039                             return;
2040                         }
2041                         return scalar @{$field->{$addr}} unless wantarray;
2042
2043                         # Make a copy; had problems with caller modifying the
2044                         # original otherwise
2045                         my @return = @{$field->{$addr}};
2046                         return @return;
2047                     }
2048                 }
2049                 else {
2050
2051                     # Here not an array value, a simpler function.
2052                     no strict "refs";
2053                     *$subname = sub {
2054                         use strict "refs";
2055                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2056                         no overloading;
2057                         return $field->{pack 'J', $_[0]};
2058                     }
2059                 }
2060             }
2061             elsif (substr('settable', 0, length $access) eq $access) {
2062                 my $subname = "${package}::${protected}set_$name";
2063                 no strict "refs";
2064                 *$subname = sub {
2065                     use strict "refs";
2066                     if (main::DEBUG) {
2067                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2068                         Carp::carp_extra_args(\@_) if @_ > 2;
2069                     }
2070                     # $self is $_[0]; $value is $_[1]
2071                     no overloading;
2072                     $field->{pack 'J', $_[0]} = $_[1];
2073                     return;
2074                 }
2075             }
2076             else {
2077                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2078             }
2079         }
2080         return;
2081     }
2082 }
2083
2084 package Input_file;
2085
2086 # All input files use this object, which stores various attributes about them,
2087 # and provides for convenient, uniform handling.  The run method wraps the
2088 # processing.  It handles all the bookkeeping of opening, reading, and closing
2089 # the file, returning only significant input lines.
2090 #
2091 # Each object gets a handler which processes the body of the file, and is
2092 # called by run().  All character property files must use the generic,
2093 # default handler, which has code scrubbed to handle things you might not
2094 # expect, including automatic EBCDIC handling.  For files that don't deal with
2095 # mapping code points to a property value, such as test files,
2096 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2097 # handler to be a custom one.  Such a handler should basically be a
2098 # while(next_line()) {...} loop.
2099 #
2100 # You can also set up handlers to
2101 #   1) call before the first line is read, for pre processing
2102 #   2) call to adjust each line of the input before the main handler gets
2103 #      them.  This can be automatically generated, if appropriately simple
2104 #      enough, by specifiying a Properties parameter in the constructor.
2105 #   3) call upon EOF before the main handler exits its loop
2106 #   4) call at the end, for post processing
2107 #
2108 # $_ is used to store the input line, and is to be filtered by the
2109 # each_line_handler()s.  So, if the format of the line is not in the desired
2110 # format for the main handler, these are used to do that adjusting.  They can
2111 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2112 # so the $_ output of one is used as the input to the next.  None of the other
2113 # handlers are stackable, but could easily be changed to be so.
2114 #
2115 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2116 # which insert the parameters as lines to be processed before the next input
2117 # file line is read.  This allows the EOF handler to flush buffers, for
2118 # example.  The difference between the two routines is that the lines inserted
2119 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
2120 # called it from such a handler, you would get infinite recursion.)  Lines
2121 # inserted by insert_adjusted_lines() go directly to the main handler without
2122 # any adjustments.  If the  post-processing handler calls any of these, there
2123 # will be no effect.  Some error checking for these conditions could be added,
2124 # but it hasn't been done.
2125 #
2126 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2127 # to prevent further processing of the line.  This routine will output the
2128 # message as a warning once, and then keep a count of the lines that have the
2129 # same message, and output that count at the end of the file's processing.
2130 # This keeps the number of messages down to a manageable amount.
2131 #
2132 # get_missings() should be called to retrieve any @missing input lines.
2133 # Messages will be raised if this isn't done if the options aren't to ignore
2134 # missings.
2135
2136 sub trace { return main::trace(@_); }
2137
2138 { # Closure
2139     # Keep track of fields that are to be put into the constructor.
2140     my %constructor_fields;
2141
2142     main::setup_package(Constructor_Fields => \%constructor_fields);
2143
2144     my %file; # Input file name, required
2145     main::set_access('file', \%file, qw{ c r });
2146
2147     my %first_released; # Unicode version file was first released in, required
2148     main::set_access('first_released', \%first_released, qw{ c r });
2149
2150     my %handler;    # Subroutine to process the input file, defaults to
2151                     # 'process_generic_property_file'
2152     main::set_access('handler', \%handler, qw{ c });
2153
2154     my %property;
2155     # name of property this file is for.  defaults to none, meaning not
2156     # applicable, or is otherwise determinable, for example, from each line.
2157     main::set_access('property', \%property, qw{ c r });
2158
2159     my %optional;
2160     # If this is true, the file is optional.  If not present, no warning is
2161     # output.  If it is present, the string given by this parameter is
2162     # evaluated, and if false the file is not processed.
2163     main::set_access('optional', \%optional, 'c', 'r');
2164
2165     my %non_skip;
2166     # This is used for debugging, to skip processing of all but a few input
2167     # files.  Add 'non_skip => 1' to the constructor for those files you want
2168     # processed when you set the $debug_skip global.
2169     main::set_access('non_skip', \%non_skip, 'c');
2170
2171     my %skip;
2172     # This is used to skip processing of this input file semi-permanently,
2173     # when it evaluates to true.  The value should be the reason the file is
2174     # being skipped.  It is used for files that we aren't planning to process
2175     # anytime soon, but want to allow to be in the directory and not raise a
2176     # message that we are not handling.  Mostly for test files.  This is in
2177     # contrast to the non_skip element, which is supposed to be used very
2178     # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
2179     # pretty much will never look at can be placed in the global
2180     # %ignored_files instead.  Ones used here will be added to %skipped files
2181     main::set_access('skip', \%skip, 'c');
2182
2183     my %each_line_handler;
2184     # list of subroutines to look at and filter each non-comment line in the
2185     # file.  defaults to none.  The subroutines are called in order, each is
2186     # to adjust $_ for the next one, and the final one adjusts it for
2187     # 'handler'
2188     main::set_access('each_line_handler', \%each_line_handler, 'c');
2189
2190     my %properties; # Optional ordered list of the properties that occur in each
2191     # meaningful line of the input file.  If present, an appropriate
2192     # each_line_handler() is automatically generated and pushed onto the stack
2193     # of such handlers.  This is useful when a file contains multiple
2194     # proerties per line, but no other special considerations are necessary.
2195     # The special value "<ignored>" means to discard the corresponding input
2196     # field.
2197     # Any @missing lines in the file should also match this syntax; no such
2198     # files exist as of 6.3.  But if it happens in a future release, the code
2199     # could be expanded to properly parse them.
2200     main::set_access('properties', \%properties, qw{ c r });
2201
2202     my %has_missings_defaults;
2203     # ? Are there lines in the file giving default values for code points
2204     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2205     # the norm, but IGNORED means it has such lines, but the handler doesn't
2206     # use them.  Having these three states allows us to catch changes to the
2207     # UCD that this program should track.  XXX This could be expanded to
2208     # specify the syntax for such lines, like %properties above.
2209     main::set_access('has_missings_defaults',
2210                                         \%has_missings_defaults, qw{ c r });
2211
2212     my %pre_handler;
2213     # Subroutine to call before doing anything else in the file.  If undef, no
2214     # such handler is called.
2215     main::set_access('pre_handler', \%pre_handler, qw{ c });
2216
2217     my %eof_handler;
2218     # Subroutine to call upon getting an EOF on the input file, but before
2219     # that is returned to the main handler.  This is to allow buffers to be
2220     # flushed.  The handler is expected to call insert_lines() or
2221     # insert_adjusted() with the buffered material
2222     main::set_access('eof_handler', \%eof_handler, qw{ c r });
2223
2224     my %post_handler;
2225     # Subroutine to call after all the lines of the file are read in and
2226     # processed.  If undef, no such handler is called.
2227     main::set_access('post_handler', \%post_handler, qw{ c });
2228
2229     my %progress_message;
2230     # Message to print to display progress in lieu of the standard one
2231     main::set_access('progress_message', \%progress_message, qw{ c });
2232
2233     my %handle;
2234     # cache open file handle, internal.  Is undef if file hasn't been
2235     # processed at all, empty if has;
2236     main::set_access('handle', \%handle);
2237
2238     my %added_lines;
2239     # cache of lines added virtually to the file, internal
2240     main::set_access('added_lines', \%added_lines);
2241
2242     my %remapped_lines;
2243     # cache of lines added virtually to the file, internal
2244     main::set_access('remapped_lines', \%remapped_lines);
2245
2246     my %errors;
2247     # cache of errors found, internal
2248     main::set_access('errors', \%errors);
2249
2250     my %missings;
2251     # storage of '@missing' defaults lines
2252     main::set_access('missings', \%missings);
2253
2254     sub _next_line;
2255     sub _next_line_with_remapped_range;
2256
2257     sub new {
2258         my $class = shift;
2259
2260         my $self = bless \do{ my $anonymous_scalar }, $class;
2261         my $addr = do { no overloading; pack 'J', $self; };
2262
2263         # Set defaults
2264         $handler{$addr} = \&main::process_generic_property_file;
2265         $non_skip{$addr} = 0;
2266         $skip{$addr} = 0;
2267         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2268         $handle{$addr} = undef;
2269         $added_lines{$addr} = [ ];
2270         $remapped_lines{$addr} = [ ];
2271         $each_line_handler{$addr} = [ ];
2272         $errors{$addr} = { };
2273         $missings{$addr} = [ ];
2274
2275         # Two positional parameters.
2276         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2277         $file{$addr} = main::internal_file_to_platform(shift);
2278         $first_released{$addr} = shift;
2279
2280         # The rest of the arguments are key => value pairs
2281         # %constructor_fields has been set up earlier to list all possible
2282         # ones.  Either set or push, depending on how the default has been set
2283         # up just above.
2284         my %args = @_;
2285         foreach my $key (keys %args) {
2286             my $argument = $args{$key};
2287
2288             # Note that the fields are the lower case of the constructor keys
2289             my $hash = $constructor_fields{lc $key};
2290             if (! defined $hash) {
2291                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2292                 next;
2293             }
2294             if (ref $hash->{$addr} eq 'ARRAY') {
2295                 if (ref $argument eq 'ARRAY') {
2296                     foreach my $argument (@{$argument}) {
2297                         next if ! defined $argument;
2298                         push @{$hash->{$addr}}, $argument;
2299                     }
2300                 }
2301                 else {
2302                     push @{$hash->{$addr}}, $argument if defined $argument;
2303                 }
2304             }
2305             else {
2306                 $hash->{$addr} = $argument;
2307             }
2308             delete $args{$key};
2309         };
2310
2311         # If the file has a property for it, it means that the property is not
2312         # listed in the file's entries.  So add a handler to the list of line
2313         # handlers to insert the property name into the lines, to provide a
2314         # uniform interface to the final processing subroutine.
2315         # the final code doesn't have to worry about that.
2316         if ($property{$addr}) {
2317             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2318         }
2319
2320         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2321             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2322         }
2323
2324         # If skipping, set to optional, and add to list of ignored files,
2325         # including its reason
2326         if ($skip{$addr}) {
2327             $optional{$addr} = 1;
2328             $skipped_files{$file{$addr}} = $skip{$addr}
2329         }
2330         elsif ($properties{$addr}) {
2331
2332             # Add a handler for each line in the input so that it creates a
2333             # separate input line for each property in those input lines, thus
2334             # making them suitable for process_generic_property_file().
2335
2336             push @{$each_line_handler{$addr}},
2337                  sub {
2338                     my $file = shift;
2339                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2340
2341                     my @fields = split /\s*;\s*/, $_, -1;
2342
2343                     if (@fields - 1 > @{$properties{$addr}}) {
2344                         $file->carp_bad_line('Extra fields');
2345                         $_ = "";
2346                         return;
2347                     }
2348                     my $range = shift @fields;  # 0th element is always the
2349                                                 # range
2350
2351                     # The next fields in the input line correspond
2352                     # respectively to the stored properties.
2353                     for my $i (0 ..  @{$properties{$addr}} - 1) {
2354                         my $property_name = $properties{$addr}[$i];
2355                         next if $property_name eq '<ignored>';
2356                         $file->insert_adjusted_lines(
2357                               "$range; $property_name; $fields[$i]");
2358                     }
2359                     $_ = "";
2360
2361                     return;
2362                 };
2363         }
2364
2365         {   # On non-ascii platforms, we use a special handler
2366             no strict;
2367             no warnings 'once';
2368             *next_line = (main::NON_ASCII_PLATFORM)
2369                          ? *_next_line_with_remapped_range
2370                          : *_next_line;
2371         }
2372
2373         return $self;
2374     }
2375
2376
2377     use overload
2378         fallback => 0,
2379         qw("") => "_operator_stringify",
2380         "." => \&main::_operator_dot,
2381         ".=" => \&main::_operator_dot_equal,
2382     ;
2383
2384     sub _operator_stringify {
2385         my $self = shift;
2386
2387         return __PACKAGE__ . " object for " . $self->file;
2388     }
2389
2390     # flag to make sure extracted files are processed early
2391     my $seen_non_extracted_non_age = 0;
2392
2393     sub run {
2394         # Process the input object $self.  This opens and closes the file and
2395         # calls all the handlers for it.  Currently,  this can only be called
2396         # once per file, as it destroy's the EOF handler
2397
2398         my $self = shift;
2399         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2400
2401         my $addr = do { no overloading; pack 'J', $self; };
2402
2403         my $file = $file{$addr};
2404
2405         # Don't process if not expecting this file (because released later
2406         # than this Unicode version), and isn't there.  This means if someone
2407         # copies it into an earlier version's directory, we will go ahead and
2408         # process it.
2409         return if $first_released{$addr} gt $v_version && ! -e $file;
2410
2411         # If in debugging mode and this file doesn't have the non-skip
2412         # flag set, and isn't one of the critical files, skip it.
2413         if ($debug_skip
2414             && $first_released{$addr} ne v0
2415             && ! $non_skip{$addr})
2416         {
2417             print "Skipping $file in debugging\n" if $verbosity;
2418             return;
2419         }
2420
2421         # File could be optional
2422         if ($optional{$addr}) {
2423             return unless -e $file;
2424             my $result = eval $optional{$addr};
2425             if (! defined $result) {
2426                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2427                 return;
2428             }
2429             if (! $result) {
2430                 if ($verbosity) {
2431                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2432                 }
2433                 return;
2434             }
2435         }
2436
2437         if (! defined $file || ! -e $file) {
2438
2439             # If the file doesn't exist, see if have internal data for it
2440             # (based on first_released being 0).
2441             if ($first_released{$addr} eq v0) {
2442                 $handle{$addr} = 'pretend_is_open';
2443             }
2444             else {
2445                 if (! $optional{$addr}  # File could be optional
2446                     && $v_version ge $first_released{$addr})
2447                 {
2448                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2449                 }
2450                 return;
2451             }
2452         }
2453         else {
2454
2455             # Here, the file exists.  Some platforms may change the case of
2456             # its name
2457             if ($seen_non_extracted_non_age) {
2458                 if ($file =~ /$EXTRACTED/i) {
2459                     Carp::my_carp_bug(main::join_lines(<<END
2460 $file should be processed just after the 'Prop...Alias' files, and before
2461 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2462 have subtle problems
2463 END
2464                     ));
2465                 }
2466             }
2467             elsif ($EXTRACTED_DIR
2468                     && $first_released{$addr} ne v0
2469                     && $file !~ /$EXTRACTED/i
2470                     && lc($file) ne 'dage.txt')
2471             {
2472                 # We don't set this (by the 'if' above) if we have no
2473                 # extracted directory, so if running on an early version,
2474                 # this test won't work.  Not worth worrying about.
2475                 $seen_non_extracted_non_age = 1;
2476             }
2477
2478             # And mark the file as having being processed, and warn if it
2479             # isn't a file we are expecting.  As we process the files,
2480             # they are deleted from the hash, so any that remain at the
2481             # end of the program are files that we didn't process.
2482             my $fkey = File::Spec->rel2abs($file);
2483             my $expecting = delete $potential_files{lc($fkey)};
2484
2485             Carp::my_carp("Was not expecting '$file'.") if
2486                     ! $expecting
2487                     && ! defined $handle{$addr};
2488
2489             # Having deleted from expected files, we can quit if not to do
2490             # anything.  Don't print progress unless really want verbosity
2491             if ($skip{$addr}) {
2492                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2493                 return;
2494             }
2495
2496             # Open the file, converting the slashes used in this program
2497             # into the proper form for the OS
2498             my $file_handle;
2499             if (not open $file_handle, "<", $file) {
2500                 Carp::my_carp("Can't open $file.  Skipping: $!");
2501                 return 0;
2502             }
2503             $handle{$addr} = $file_handle; # Cache the open file handle
2504
2505             if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
2506                 $_ = <$file_handle>;
2507                 if ($_ !~ / - $string_version \. /x) {
2508                     chomp;
2509                     $_ =~ s/^#\s*//;
2510                     die Carp::my_carp("File '$file' is version '$_'.  It should be version $string_version");
2511                 }
2512             }
2513         }
2514
2515         if ($verbosity >= $PROGRESS) {
2516             if ($progress_message{$addr}) {
2517                 print "$progress_message{$addr}\n";
2518             }
2519             else {
2520                 # If using a virtual file, say so.
2521                 print "Processing ", (-e $file)
2522                                        ? $file
2523                                        : "substitute $file",
2524                                      "\n";
2525             }
2526         }
2527
2528
2529         # Call any special handler for before the file.
2530         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2531
2532         # Then the main handler
2533         &{$handler{$addr}}($self);
2534
2535         # Then any special post-file handler.
2536         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2537
2538         # If any errors have been accumulated, output the counts (as the first
2539         # error message in each class was output when it was encountered).
2540         if ($errors{$addr}) {
2541             my $total = 0;
2542             my $types = 0;
2543             foreach my $error (keys %{$errors{$addr}}) {
2544                 $total += $errors{$addr}->{$error};
2545                 delete $errors{$addr}->{$error};
2546                 $types++;
2547             }
2548             if ($total > 1) {
2549                 my $message
2550                         = "A total of $total lines had errors in $file.  ";
2551
2552                 $message .= ($types == 1)
2553                             ? '(Only the first one was displayed.)'
2554                             : '(Only the first of each type was displayed.)';
2555                 Carp::my_carp($message);
2556             }
2557         }
2558
2559         if (@{$missings{$addr}}) {
2560             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2561         }
2562
2563         # If a real file handle, close it.
2564         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2565                                                         ref $handle{$addr};
2566         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2567                                # the file, as opposed to undef
2568         return;
2569     }
2570
2571     sub _next_line {
2572         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2573         # if such a line exists.  'logical' means that any lines that have
2574         # been added via insert_lines() will be returned in $_ before the file
2575         # is read again.
2576
2577         my $self = shift;
2578         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2579
2580         my $addr = do { no overloading; pack 'J', $self; };
2581
2582         # Here the file is open (or if the handle is not a ref, is an open
2583         # 'virtual' file).  Get the next line; any inserted lines get priority
2584         # over the file itself.
2585         my $adjusted;
2586
2587         LINE:
2588         while (1) { # Loop until find non-comment, non-empty line
2589             #local $to_trace = 1 if main::DEBUG;
2590             my $inserted_ref = shift @{$added_lines{$addr}};
2591             if (defined $inserted_ref) {
2592                 ($adjusted, $_) = @{$inserted_ref};
2593                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2594                 return 1 if $adjusted;
2595             }
2596             else {
2597                 last if ! ref $handle{$addr}; # Don't read unless is real file
2598                 last if ! defined ($_ = readline $handle{$addr});
2599             }
2600             chomp;
2601             trace $_ if main::DEBUG && $to_trace;
2602
2603             # See if this line is the comment line that defines what property
2604             # value that code points that are not listed in the file should
2605             # have.  The format or existence of these lines is not guaranteed
2606             # by Unicode since they are comments, but the documentation says
2607             # that this was added for machine-readability, so probably won't
2608             # change.  This works starting in Unicode Version 5.0.  They look
2609             # like:
2610             #
2611             # @missing: 0000..10FFFF; Not_Reordered
2612             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2613             # @missing: 0000..10FFFF; ; NaN
2614             #
2615             # Save the line for a later get_missings() call.
2616             if (/$missing_defaults_prefix/) {
2617                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2618                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2619                 }
2620                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2621                     my @defaults = split /\s* ; \s*/x, $_;
2622
2623                     # The first field is the @missing, which ends in a
2624                     # semi-colon, so can safely shift.
2625                     shift @defaults;
2626
2627                     # Some of these lines may have empty field placeholders
2628                     # which get in the way.  An example is:
2629                     # @missing: 0000..10FFFF; ; NaN
2630                     # Remove them.  Process starting from the top so the
2631                     # splice doesn't affect things still to be looked at.
2632                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2633                         next if $defaults[$i] ne "";
2634                         splice @defaults, $i, 1;
2635                     }
2636
2637                     # What's left should be just the property (maybe) and the
2638                     # default.  Having only one element means it doesn't have
2639                     # the property.
2640                     my $default;
2641                     my $property;
2642                     if (@defaults >= 1) {
2643                         if (@defaults == 1) {
2644                             $default = $defaults[0];
2645                         }
2646                         else {
2647                             $property = $defaults[0];
2648                             $default = $defaults[1];
2649                         }
2650                     }
2651
2652                     if (@defaults < 1
2653                         || @defaults > 2
2654                         || ($default =~ /^</
2655                             && $default !~ /^<code *point>$/i
2656                             && $default !~ /^<none>$/i
2657                             && $default !~ /^<script>$/i))
2658                     {
2659                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2660                     }
2661                     else {
2662
2663                         # If the property is missing from the line, it should
2664                         # be the one for the whole file
2665                         $property = $property{$addr} if ! defined $property;
2666
2667                         # Change <none> to the null string, which is what it
2668                         # really means.  If the default is the code point
2669                         # itself, set it to <code point>, which is what
2670                         # Unicode uses (but sometimes they've forgotten the
2671                         # space)
2672                         if ($default =~ /^<none>$/i) {
2673                             $default = "";
2674                         }
2675                         elsif ($default =~ /^<code *point>$/i) {
2676                             $default = $CODE_POINT;
2677                         }
2678                         elsif ($default =~ /^<script>$/i) {
2679
2680                             # Special case this one.  Currently is from
2681                             # ScriptExtensions.txt, and means for all unlisted
2682                             # code points, use their Script property values.
2683                             # For the code points not listed in that file, the
2684                             # default value is 'Unknown'.
2685                             $default = "Unknown";
2686                         }
2687
2688                         # Store them as a sub-arrays with both components.
2689                         push @{$missings{$addr}}, [ $default, $property ];
2690                     }
2691                 }
2692
2693                 # There is nothing for the caller to process on this comment
2694                 # line.
2695                 next;
2696             }
2697
2698             # Remove comments and trailing space, and skip this line if the
2699             # result is empty
2700             s/#.*//;
2701             s/\s+$//;
2702             next if /^$/;
2703
2704             # Call any handlers for this line, and skip further processing of
2705             # the line if the handler sets the line to null.
2706             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2707                 &{$sub_ref}($self);
2708                 next LINE if /^$/;
2709             }
2710
2711             # Here the line is ok.  return success.
2712             return 1;
2713         } # End of looping through lines.
2714
2715         # If there is an EOF handler, call it (only once) and if it generates
2716         # more lines to process go back in the loop to handle them.
2717         if ($eof_handler{$addr}) {
2718             &{$eof_handler{$addr}}($self);
2719             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2720             goto LINE if $added_lines{$addr};
2721         }
2722
2723         # Return failure -- no more lines.
2724         return 0;
2725
2726     }
2727
2728     sub _next_line_with_remapped_range {
2729         my $self = shift;
2730         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2731
2732         # like _next_line(), but for use on non-ASCII platforms.  It sets $_
2733         # to be the next logical input line, if any.  Returns non-zero if such
2734         # a line exists.  'logical' means that any lines that have been added
2735         # via insert_lines() will be returned in $_ before the file is read
2736         # again.
2737         #
2738         # The difference from _next_line() is that this remaps the Unicode
2739         # code points in the input to those of the native platform.  Each
2740         # input line contains a single code point, or a single contiguous
2741         # range of them  This routine splits each range into its individual
2742         # code points and caches them.  It returns the cached values,
2743         # translated into their native equivalents, one at a time, for each
2744         # call, before reading the next line.  Since native values can only be
2745         # a single byte wide, no translation is needed for code points above
2746         # 0xFF, and ranges that are entirely above that number are not split.
2747         # If an input line contains the range 254-1000, it would be split into
2748         # three elements: 254, 255, and 256-1000.  (The downstream table
2749         # insertion code will sort and coalesce the individual code points
2750         # into appropriate ranges.)
2751
2752         my $addr = do { no overloading; pack 'J', $self; };
2753
2754         while (1) {
2755
2756             # Look in cache before reading the next line.  Return any cached
2757             # value, translated
2758             my $inserted = shift @{$remapped_lines{$addr}};
2759             if (defined $inserted) {
2760                 trace $inserted if main::DEBUG && $to_trace;
2761                 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
2762                 trace $_ if main::DEBUG && $to_trace;
2763                 return 1;
2764             }
2765
2766             # Get the next line.
2767             return 0 unless _next_line($self);
2768
2769             # If there is a special handler for it, return the line,
2770             # untranslated.  This should happen only for files that are
2771             # special, not being code-point related, such as property names.
2772             return 1 if $handler{$addr}
2773                                     != \&main::process_generic_property_file;
2774
2775             my ($range, $property_name, $map, @remainder)
2776                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
2777
2778             if (@remainder
2779                 || ! defined $property_name
2780                 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
2781             {
2782                 Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
2783             }
2784
2785             my $low = hex $1;
2786             my $high = (defined $2) ? hex $2 : $low;
2787
2788             # If the input maps the range to another code point, remap the
2789             # target if it is between 0 and 255.
2790             my $tail;
2791             if (defined $map) {
2792                 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
2793                 $tail = "$property_name; $map";
2794                 $_ = "$range; $tail";
2795             }
2796             else {
2797                 $tail = $property_name;
2798             }
2799
2800             # If entire range is above 255, just return it, unchanged (except
2801             # any mapped-to code point, already changed above)
2802             return 1 if $low > 255;
2803
2804             # Cache an entry for every code point < 255.  For those in the
2805             # range above 255, return a dummy entry for just that portion of
2806             # the range.  Note that this will be out-of-order, but that is not
2807             # a problem.
2808             foreach my $code_point ($low .. $high) {
2809                 if ($code_point > 255) {
2810                     $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
2811                     return 1;
2812                 }
2813                 push @{$remapped_lines{$addr}}, "$code_point; $tail";
2814             }
2815         } # End of looping through lines.
2816
2817         # NOTREACHED
2818     }
2819
2820 #   Not currently used, not fully tested.
2821 #    sub peek {
2822 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2823 #        # record.  Not callable from an each_line_handler(), nor does it call
2824 #        # an each_line_handler() on the line.
2825 #
2826 #        my $self = shift;
2827 #        my $addr = do { no overloading; pack 'J', $self; };
2828 #
2829 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2830 #            my ($adjusted, $line) = @{$inserted_ref};
2831 #            next if $adjusted;
2832 #
2833 #            # Remove comments and trailing space, and return a non-empty
2834 #            # resulting line
2835 #            $line =~ s/#.*//;
2836 #            $line =~ s/\s+$//;
2837 #            return $line if $line ne "";
2838 #        }
2839 #
2840 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2841 #        while (1) { # Loop until find non-comment, non-empty line
2842 #            local $to_trace = 1 if main::DEBUG;
2843 #            trace $_ if main::DEBUG && $to_trace;
2844 #            return if ! defined (my $line = readline $handle{$addr});
2845 #            chomp $line;
2846 #            push @{$added_lines{$addr}}, [ 0, $line ];
2847 #
2848 #            $line =~ s/#.*//;
2849 #            $line =~ s/\s+$//;
2850 #            return $line if $line ne "";
2851 #        }
2852 #
2853 #        return;
2854 #    }
2855
2856
2857     sub insert_lines {
2858         # Lines can be inserted so that it looks like they were in the input
2859         # file at the place it was when this routine is called.  See also
2860         # insert_adjusted_lines().  Lines inserted via this routine go through
2861         # any each_line_handler()
2862
2863         my $self = shift;
2864
2865         # Each inserted line is an array, with the first element being 0 to
2866         # indicate that this line hasn't been adjusted, and needs to be
2867         # processed.
2868         no overloading;
2869         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2870         return;
2871     }
2872
2873     sub insert_adjusted_lines {
2874         # Lines can be inserted so that it looks like they were in the input
2875         # file at the place it was when this routine is called.  See also
2876         # insert_lines().  Lines inserted via this routine are already fully
2877         # adjusted, ready to be processed; each_line_handler()s handlers will
2878         # not be called.  This means this is not a completely general
2879         # facility, as only the last each_line_handler on the stack should
2880         # call this.  It could be made more general, by passing to each of the
2881         # line_handlers their position on the stack, which they would pass on
2882         # to this routine, and that would replace the boolean first element in
2883         # the anonymous array pushed here, so that the next_line routine could
2884         # use that to call only those handlers whose index is after it on the
2885         # stack.  But this is overkill for what is needed now.
2886
2887         my $self = shift;
2888         trace $_[0] if main::DEBUG && $to_trace;
2889
2890         # Each inserted line is an array, with the first element being 1 to
2891         # indicate that this line has been adjusted
2892         no overloading;
2893         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2894         return;
2895     }
2896
2897     sub get_missings {
2898         # Returns the stored up @missings lines' values, and clears the list.
2899         # The values are in an array, consisting of the default in the first
2900         # element, and the property in the 2nd.  However, since these lines
2901         # can be stacked up, the return is an array of all these arrays.
2902
2903         my $self = shift;
2904         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2905
2906         my $addr = do { no overloading; pack 'J', $self; };
2907
2908         # If not accepting a list return, just return the first one.
2909         return shift @{$missings{$addr}} unless wantarray;
2910
2911         my @return = @{$missings{$addr}};
2912         undef @{$missings{$addr}};
2913         return @return;
2914     }
2915
2916     sub _insert_property_into_line {
2917         # Add a property field to $_, if this file requires it.
2918
2919         my $self = shift;
2920         my $addr = do { no overloading; pack 'J', $self; };
2921         my $property = $property{$addr};
2922         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2923
2924         $_ =~ s/(;|$)/; $property$1/;
2925         return;
2926     }
2927
2928     sub carp_bad_line {
2929         # Output consistent error messages, using either a generic one, or the
2930         # one given by the optional parameter.  To avoid gazillions of the
2931         # same message in case the syntax of a  file is way off, this routine
2932         # only outputs the first instance of each message, incrementing a
2933         # count so the totals can be output at the end of the file.
2934
2935         my $self = shift;
2936         my $message = shift;
2937         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2938
2939         my $addr = do { no overloading; pack 'J', $self; };
2940
2941         $message = 'Unexpected line' unless $message;
2942
2943         # No trailing punctuation so as to fit with our addenda.
2944         $message =~ s/[.:;,]$//;
2945
2946         # If haven't seen this exact message before, output it now.  Otherwise
2947         # increment the count of how many times it has occurred
2948         unless ($errors{$addr}->{$message}) {
2949             Carp::my_carp("$message in '$_' in "
2950                             . $file{$addr}
2951                             . " at line $..  Skipping this line;");
2952             $errors{$addr}->{$message} = 1;
2953         }
2954         else {
2955             $errors{$addr}->{$message}++;
2956         }
2957
2958         # Clear the line to prevent any further (meaningful) processing of it.
2959         $_ = "";
2960
2961         return;
2962     }
2963 } # End closure
2964
2965 package Multi_Default;
2966
2967 # Certain properties in early versions of Unicode had more than one possible
2968 # default for code points missing from the files.  In these cases, one
2969 # default applies to everything left over after all the others are applied,
2970 # and for each of the others, there is a description of which class of code
2971 # points applies to it.  This object helps implement this by storing the
2972 # defaults, and for all but that final default, an eval string that generates
2973 # the class that it applies to.
2974
2975
2976 {   # Closure
2977
2978     main::setup_package();
2979
2980     my %class_defaults;
2981     # The defaults structure for the classes
2982     main::set_access('class_defaults', \%class_defaults);
2983
2984     my %other_default;
2985     # The default that applies to everything left over.
2986     main::set_access('other_default', \%other_default, 'r');
2987
2988
2989     sub new {
2990         # The constructor is called with default => eval pairs, terminated by
2991         # the left-over default. e.g.
2992         # Multi_Default->new(
2993         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2994         #               -  0x200D',
2995         #        'R' => 'some other expression that evaluates to code points',
2996         #        .
2997         #        .
2998         #        .
2999         #        'U'));
3000
3001         my $class = shift;
3002
3003         my $self = bless \do{my $anonymous_scalar}, $class;
3004         my $addr = do { no overloading; pack 'J', $self; };
3005
3006         while (@_ > 1) {
3007             my $default = shift;
3008             my $eval = shift;
3009             $class_defaults{$addr}->{$default} = $eval;
3010         }
3011
3012         $other_default{$addr} = shift;
3013
3014         return $self;
3015     }
3016
3017     sub get_next_defaults {
3018         # Iterates and returns the next class of defaults.
3019         my $self = shift;
3020         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3021
3022         my $addr = do { no overloading; pack 'J', $self; };
3023
3024         return each %{$class_defaults{$addr}};
3025     }
3026 }
3027
3028 package Alias;
3029
3030 # An alias is one of the names that a table goes by.  This class defines them
3031 # including some attributes.  Everything is currently setup in the
3032 # constructor.
3033
3034
3035 {   # Closure
3036
3037     main::setup_package();
3038
3039     my %name;
3040     main::set_access('name', \%name, 'r');
3041
3042     my %loose_match;
3043     # Should this name match loosely or not.
3044     main::set_access('loose_match', \%loose_match, 'r');
3045
3046     my %make_re_pod_entry;
3047     # Some aliases should not get their own entries in the re section of the
3048     # pod, because they are covered by a wild-card, and some we want to
3049     # discourage use of.  Binary
3050     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3051
3052     my %ucd;
3053     # Is this documented to be accessible via Unicode::UCD
3054     main::set_access('ucd', \%ucd, 'r', 's');
3055
3056     my %status;
3057     # Aliases have a status, like deprecated, or even suppressed (which means
3058     # they don't appear in documentation).  Enum
3059     main::set_access('status', \%status, 'r');
3060
3061     my %ok_as_filename;
3062     # Similarly, some aliases should not be considered as usable ones for
3063     # external use, such as file names, or we don't want documentation to
3064     # recommend them.  Boolean
3065     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3066
3067     sub new {
3068         my $class = shift;
3069
3070         my $self = bless \do { my $anonymous_scalar }, $class;
3071         my $addr = do { no overloading; pack 'J', $self; };
3072
3073         $name{$addr} = shift;
3074         $loose_match{$addr} = shift;
3075         $make_re_pod_entry{$addr} = shift;
3076         $ok_as_filename{$addr} = shift;
3077         $status{$addr} = shift;
3078         $ucd{$addr} = shift;
3079
3080         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3081
3082         # Null names are never ok externally
3083         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3084
3085         return $self;
3086     }
3087 }
3088
3089 package Range;
3090
3091 # A range is the basic unit for storing code points, and is described in the
3092 # comments at the beginning of the program.  Each range has a starting code
3093 # point; an ending code point (not less than the starting one); a value
3094 # that applies to every code point in between the two end-points, inclusive;
3095 # and an enum type that applies to the value.  The type is for the user's
3096 # convenience, and has no meaning here, except that a non-zero type is
3097 # considered to not obey the normal Unicode rules for having standard forms.
3098 #
3099 # The same structure is used for both map and match tables, even though in the
3100 # latter, the value (and hence type) is irrelevant and could be used as a
3101 # comment.  In map tables, the value is what all the code points in the range
3102 # map to.  Type 0 values have the standardized version of the value stored as
3103 # well, so as to not have to recalculate it a lot.
3104
3105 sub trace { return main::trace(@_); }
3106
3107 {   # Closure
3108
3109     main::setup_package();
3110
3111     my %start;
3112     main::set_access('start', \%start, 'r', 's');
3113
3114     my %end;
3115     main::set_access('end', \%end, 'r', 's');
3116
3117     my %value;
3118     main::set_access('value', \%value, 'r');
3119
3120     my %type;
3121     main::set_access('type', \%type, 'r');
3122
3123     my %standard_form;
3124     # The value in internal standard form.  Defined only if the type is 0.
3125     main::set_access('standard_form', \%standard_form);
3126
3127     # Note that if these fields change, the dump() method should as well
3128
3129     sub new {
3130         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3131         my $class = shift;
3132
3133         my $self = bless \do { my $anonymous_scalar }, $class;
3134         my $addr = do { no overloading; pack 'J', $self; };
3135
3136         $start{$addr} = shift;
3137         $end{$addr} = shift;
3138
3139         my %args = @_;
3140
3141         my $value = delete $args{'Value'};  # Can be 0
3142         $value = "" unless defined $value;
3143         $value{$addr} = $value;
3144
3145         $type{$addr} = delete $args{'Type'} || 0;
3146
3147         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3148
3149         return $self;
3150     }
3151
3152     use overload
3153         fallback => 0,
3154         qw("") => "_operator_stringify",
3155         "." => \&main::_operator_dot,
3156         ".=" => \&main::_operator_dot_equal,
3157     ;
3158
3159     sub _operator_stringify {
3160         my $self = shift;
3161         my $addr = do { no overloading; pack 'J', $self; };
3162
3163         # Output it like '0041..0065 (value)'
3164         my $return = sprintf("%04X", $start{$addr})
3165                         .  '..'
3166                         . sprintf("%04X", $end{$addr});
3167         my $value = $value{$addr};
3168         my $type = $type{$addr};
3169         $return .= ' (';
3170         $return .= "$value";
3171         $return .= ", Type=$type" if $type != 0;
3172         $return .= ')';
3173
3174         return $return;
3175     }
3176
3177     sub standard_form {
3178         # Calculate the standard form only if needed, and cache the result.
3179         # The standard form is the value itself if the type is special.
3180         # This represents a considerable CPU and memory saving - at the time
3181         # of writing there are 368676 non-special objects, but the standard
3182         # form is only requested for 22047 of them - ie about 6%.
3183
3184         my $self = shift;
3185         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3186
3187         my $addr = do { no overloading; pack 'J', $self; };
3188
3189         return $standard_form{$addr} if defined $standard_form{$addr};
3190
3191         my $value = $value{$addr};
3192         return $value if $type{$addr};
3193         return $standard_form{$addr} = main::standardize($value);
3194     }
3195
3196     sub dump {
3197         # Human, not machine readable.  For machine readable, comment out this
3198         # entire routine and let the standard one take effect.
3199         my $self = shift;
3200         my $indent = shift;
3201         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3202
3203         my $addr = do { no overloading; pack 'J', $self; };
3204
3205         my $return = $indent
3206                     . sprintf("%04X", $start{$addr})
3207                     . '..'
3208                     . sprintf("%04X", $end{$addr})
3209                     . " '$value{$addr}';";
3210         if (! defined $standard_form{$addr}) {
3211             $return .= "(type=$type{$addr})";
3212         }
3213         elsif ($standard_form{$addr} ne $value{$addr}) {
3214             $return .= "(standard '$standard_form{$addr}')";
3215         }
3216         return $return;
3217     }
3218 } # End closure
3219
3220 package _Range_List_Base;
3221
3222 # Base class for range lists.  A range list is simply an ordered list of
3223 # ranges, so that the ranges with the lowest starting numbers are first in it.
3224 #
3225 # When a new range is added that is adjacent to an existing range that has the
3226 # same value and type, it merges with it to form a larger range.
3227 #
3228 # Ranges generally do not overlap, except that there can be multiple entries
3229 # of single code point ranges.  This is because of NameAliases.txt.
3230 #
3231 # In this program, there is a standard value such that if two different
3232 # values, have the same standard value, they are considered equivalent.  This
3233 # value was chosen so that it gives correct results on Unicode data
3234
3235 # There are a number of methods to manipulate range lists, and some operators
3236 # are overloaded to handle them.
3237
3238 sub trace { return main::trace(@_); }
3239
3240 { # Closure
3241
3242     our $addr;
3243
3244     # Max is initialized to a negative value that isn't adjacent to 0, for
3245     # simpler tests
3246     my $max_init = -2;
3247
3248     main::setup_package();
3249
3250     my %ranges;
3251     # The list of ranges
3252     main::set_access('ranges', \%ranges, 'readable_array');
3253
3254     my %max;
3255     # The highest code point in the list.  This was originally a method, but
3256     # actual measurements said it was used a lot.
3257     main::set_access('max', \%max, 'r');
3258
3259     my %each_range_iterator;
3260     # Iterator position for each_range()
3261     main::set_access('each_range_iterator', \%each_range_iterator);
3262
3263     my %owner_name_of;
3264     # Name of parent this is attached to, if any.  Solely for better error
3265     # messages.
3266     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3267
3268     my %_search_ranges_cache;
3269     # A cache of the previous result from _search_ranges(), for better
3270     # performance
3271     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3272
3273     sub new {
3274         my $class = shift;
3275         my %args = @_;
3276
3277         # Optional initialization data for the range list.
3278         my $initialize = delete $args{'Initialize'};
3279
3280         my $self;
3281
3282         # Use _union() to initialize.  _union() returns an object of this
3283         # class, which means that it will call this constructor recursively.
3284         # But it won't have this $initialize parameter so that it won't
3285         # infinitely loop on this.
3286         return _union($class, $initialize, %args) if defined $initialize;
3287
3288         $self = bless \do { my $anonymous_scalar }, $class;
3289         my $addr = do { no overloading; pack 'J', $self; };
3290
3291         # Optional parent object, only for debug info.
3292         $owner_name_of{$addr} = delete $args{'Owner'};
3293         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3294
3295         # Stringify, in case it is an object.
3296         $owner_name_of{$addr} = "$owner_name_of{$addr}";
3297
3298         # This is used only for error messages, and so a colon is added
3299         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3300
3301         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3302
3303         $max{$addr} = $max_init;
3304
3305         $_search_ranges_cache{$addr} = 0;
3306         $ranges{$addr} = [];
3307
3308         return $self;
3309     }
3310
3311     use overload
3312         fallback => 0,
3313         qw("") => "_operator_stringify",
3314         "." => \&main::_operator_dot,
3315         ".=" => \&main::_operator_dot_equal,
3316     ;
3317
3318     sub _operator_stringify {
3319         my $self = shift;
3320         my $addr = do { no overloading; pack 'J', $self; };
3321
3322         return "Range_List attached to '$owner_name_of{$addr}'"
3323                                                 if $owner_name_of{$addr};
3324         return "anonymous Range_List " . \$self;
3325     }
3326
3327     sub _union {
3328         # Returns the union of the input code points.  It can be called as
3329         # either a constructor or a method.  If called as a method, the result
3330         # will be a new() instance of the calling object, containing the union
3331         # of that object with the other parameter's code points;  if called as
3332         # a constructor, the first parameter gives the class that the new object
3333         # should be, and the second parameter gives the code points to go into
3334         # it.
3335         # In either case, there are two parameters looked at by this routine;
3336         # any additional parameters are passed to the new() constructor.
3337         #
3338         # The code points can come in the form of some object that contains
3339         # ranges, and has a conventionally named method to access them; or
3340         # they can be an array of individual code points (as integers); or
3341         # just a single code point.
3342         #
3343         # If they are ranges, this routine doesn't make any effort to preserve
3344         # the range values and types of one input over the other.  Therefore
3345         # this base class should not allow _union to be called from other than
3346         # initialization code, so as to prevent two tables from being added
3347         # together where the range values matter.  The general form of this
3348         # routine therefore belongs in a derived class, but it was moved here
3349         # to avoid duplication of code.  The failure to overload this in this
3350         # class keeps it safe.
3351         #
3352         # It does make the effort during initialization to accept tables with
3353         # multiple values for the same code point, and to preserve the order
3354         # of these.  If there is only one input range or range set, it doesn't
3355         # sort (as it should already be sorted to the desired order), and will
3356         # accept multiple values per code point.  Otherwise it will merge
3357         # multiple values into a single one.
3358
3359         my $self;
3360         my @args;   # Arguments to pass to the constructor
3361
3362         my $class = shift;
3363
3364         # If a method call, will start the union with the object itself, and
3365         # the class of the new object will be the same as self.
3366         if (ref $class) {
3367             $self = $class;
3368             $class = ref $self;
3369             push @args, $self;
3370         }
3371
3372         # Add the other required parameter.
3373         push @args, shift;
3374         # Rest of parameters are passed on to the constructor
3375
3376         # Accumulate all records from both lists.
3377         my @records;
3378         my $input_count = 0;
3379         for my $arg (@args) {
3380             #local $to_trace = 0 if main::DEBUG;
3381             trace "argument = $arg" if main::DEBUG && $to_trace;
3382             if (! defined $arg) {
3383                 my $message = "";
3384                 if (defined $self) {
3385                     no overloading;
3386                     $message .= $owner_name_of{pack 'J', $self};
3387                 }
3388                 Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3389                 return;
3390             }
3391
3392             $arg = [ $arg ] if ! ref $arg;
3393             my $type = ref $arg;
3394             if ($type eq 'ARRAY') {
3395                 foreach my $element (@$arg) {
3396                     push @records, Range->new($element, $element);
3397                     $input_count++;
3398                 }
3399             }
3400             elsif ($arg->isa('Range')) {
3401                 push @records, $arg;
3402                 $input_count++;
3403             }
3404             elsif ($arg->can('ranges')) {
3405                 push @records, $arg->ranges;
3406                 $input_count++;
3407             }
3408             else {
3409                 my $message = "";
3410                 if (defined $self) {
3411                     no overloading;
3412                     $message .= $owner_name_of{pack 'J', $self};
3413                 }
3414                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3415                 return;
3416             }
3417         }
3418
3419         # Sort with the range containing the lowest ordinal first, but if
3420         # two ranges start at the same code point, sort with the bigger range
3421         # of the two first, because it takes fewer cycles.
3422         if ($input_count > 1) {
3423             @records = sort { ($a->start <=> $b->start)
3424                                       or
3425                                     # if b is shorter than a, b->end will be
3426                                     # less than a->end, and we want to select
3427                                     # a, so want to return -1
3428                                     ($b->end <=> $a->end)
3429                                    } @records;
3430         }
3431
3432         my $new = $class->new(@_);
3433
3434         # Fold in records so long as they add new information.
3435         for my $set (@records) {
3436             my $start = $set->start;
3437             my $end   = $set->end;
3438             my $value = $set->value;
3439             my $type  = $set->type;
3440             if ($start > $new->max) {
3441                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3442             }
3443             elsif ($end > $new->max) {
3444                 $new->_add_delete('+', $new->max +1, $end, $value,
3445                                                                 Type => $type);
3446             }
3447             elsif ($input_count == 1) {
3448                 # Here, overlaps existing range, but is from a single input,
3449                 # so preserve the multiple values from that input.
3450                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3451                                                 Replace => $MULTIPLE_AFTER);
3452             }
3453         }
3454
3455         return $new;
3456     }
3457
3458     sub range_count {        # Return the number of ranges in the range list
3459         my $self = shift;
3460         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3461
3462         no overloading;
3463         return scalar @{$ranges{pack 'J', $self}};
3464     }
3465
3466     sub min {
3467         # Returns the minimum code point currently in the range list, or if
3468         # the range list is empty, 2 beyond the max possible.  This is a
3469         # method because used so rarely, that not worth saving between calls,
3470         # and having to worry about changing it as ranges are added and
3471         # deleted.
3472
3473         my $self = shift;
3474         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3475
3476         my $addr = do { no overloading; pack 'J', $self; };
3477
3478         # If the range list is empty, return a large value that isn't adjacent
3479         # to any that could be in the range list, for simpler tests
3480         return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3481         return $ranges{$addr}->[0]->start;
3482     }
3483
3484     sub contains {
3485         # Boolean: Is argument in the range list?  If so returns $i such that:
3486         #   range[$i]->end < $codepoint <= range[$i+1]->end
3487         # which is one beyond what you want; this is so that the 0th range
3488         # doesn't return false
3489         my $self = shift;
3490         my $codepoint = shift;
3491         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3492
3493         my $i = $self->_search_ranges($codepoint);
3494         return 0 unless defined $i;
3495
3496         # The search returns $i, such that
3497         #   range[$i-1]->end < $codepoint <= range[$i]->end
3498         # So is in the table if and only iff it is at least the start position
3499         # of range $i.
3500         no overloading;
3501         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3502         return $i + 1;
3503     }
3504
3505     sub containing_range {
3506         # Returns the range object that contains the code point, undef if none
3507
3508         my $self = shift;
3509         my $codepoint = shift;
3510         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3511
3512         my $i = $self->contains($codepoint);
3513         return unless $i;
3514
3515         # contains() returns 1 beyond where we should look
3516         no overloading;
3517         return $ranges{pack 'J', $self}->[$i-1];
3518     }
3519
3520     sub value_of {
3521         # Returns the value associated with the code point, undef if none
3522
3523         my $self = shift;
3524         my $codepoint = shift;
3525         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3526
3527         my $range = $self->containing_range($codepoint);
3528         return unless defined $range;
3529
3530         return $range->value;
3531     }
3532
3533     sub type_of {
3534         # Returns the type of the range containing the code point, undef if
3535         # the code point is not in the table
3536
3537         my $self = shift;
3538         my $codepoint = shift;
3539         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3540
3541         my $range = $self->containing_range($codepoint);
3542         return unless defined $range;
3543
3544         return $range->type;
3545     }
3546
3547     sub _search_ranges {
3548         # Find the range in the list which contains a code point, or where it
3549         # should go if were to add it.  That is, it returns $i, such that:
3550         #   range[$i-1]->end < $codepoint <= range[$i]->end
3551         # Returns undef if no such $i is possible (e.g. at end of table), or
3552         # if there is an error.
3553
3554         my $self = shift;
3555         my $code_point = shift;
3556         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3557
3558         my $addr = do { no overloading; pack 'J', $self; };
3559
3560         return if $code_point > $max{$addr};
3561         my $r = $ranges{$addr};                # The current list of ranges
3562         my $range_list_size = scalar @$r;
3563         my $i;
3564
3565         use integer;        # want integer division
3566
3567         # Use the cached result as the starting guess for this one, because,
3568         # an experiment on 5.1 showed that 90% of the time the cache was the
3569         # same as the result on the next call (and 7% it was one less).
3570         $i = $_search_ranges_cache{$addr};
3571         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3572                                             # from an intervening deletion
3573         #local $to_trace = 1 if main::DEBUG;
3574         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3575         return $i if $code_point <= $r->[$i]->end
3576                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3577
3578         # Here the cache doesn't yield the correct $i.  Try adding 1.
3579         if ($i < $range_list_size - 1
3580             && $r->[$i]->end < $code_point &&
3581             $code_point <= $r->[$i+1]->end)
3582         {
3583             $i++;
3584             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3585             $_search_ranges_cache{$addr} = $i;
3586             return $i;
3587         }
3588
3589         # Here, adding 1 also didn't work.  We do a binary search to
3590         # find the correct position, starting with current $i
3591         my $lower = 0;
3592         my $upper = $range_list_size - 1;
3593         while (1) {
3594             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3595
3596             if ($code_point <= $r->[$i]->end) {
3597
3598                 # Here we have met the upper constraint.  We can quit if we
3599                 # also meet the lower one.
3600                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3601
3602                 $upper = $i;        # Still too high.
3603
3604             }
3605             else {
3606
3607                 # Here, $r[$i]->end < $code_point, so look higher up.
3608                 $lower = $i;
3609             }
3610
3611             # Split search domain in half to try again.
3612             my $temp = ($upper + $lower) / 2;
3613
3614             # No point in continuing unless $i changes for next time
3615             # in the loop.
3616             if ($temp == $i) {
3617
3618                 # We can't reach the highest element because of the averaging.
3619                 # So if one below the upper edge, force it there and try one
3620                 # more time.
3621                 if ($i == $range_list_size - 2) {
3622
3623                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3624                     $i = $range_list_size - 1;
3625
3626                     # Change $lower as well so if fails next time through,
3627                     # taking the average will yield the same $i, and we will
3628                     # quit with the error message just below.
3629                     $lower = $i;
3630                     next;
3631                 }
3632                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3633                 return;
3634             }
3635             $i = $temp;
3636         } # End of while loop
3637
3638         if (main::DEBUG && $to_trace) {
3639             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3640             trace "i=  [ $i ]", $r->[$i];
3641             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3642         }
3643
3644         # Here we have found the offset.  Cache it as a starting point for the
3645         # next call.
3646         $_search_ranges_cache{$addr} = $i;
3647         return $i;
3648     }
3649
3650     sub _add_delete {
3651         # Add, replace or delete ranges to or from a list.  The $type
3652         # parameter gives which:
3653         #   '+' => insert or replace a range, returning a list of any changed
3654         #          ranges.
3655         #   '-' => delete a range, returning a list of any deleted ranges.
3656         #
3657         # The next three parameters give respectively the start, end, and
3658         # value associated with the range.  'value' should be null unless the
3659         # operation is '+';
3660         #
3661         # The range list is kept sorted so that the range with the lowest
3662         # starting position is first in the list, and generally, adjacent
3663         # ranges with the same values are merged into a single larger one (see
3664         # exceptions below).
3665         #
3666         # There are more parameters; all are key => value pairs:
3667         #   Type    gives the type of the value.  It is only valid for '+'.
3668         #           All ranges have types; if this parameter is omitted, 0 is
3669         #           assumed.  Ranges with type 0 are assumed to obey the
3670         #           Unicode rules for casing, etc; ranges with other types are
3671         #           not.  Otherwise, the type is arbitrary, for the caller's
3672         #           convenience, and looked at only by this routine to keep
3673         #           adjacent ranges of different types from being merged into
3674         #           a single larger range, and when Replace =>
3675         #           $IF_NOT_EQUIVALENT is specified (see just below).
3676         #   Replace  determines what to do if the range list already contains
3677         #            ranges which coincide with all or portions of the input
3678         #            range.  It is only valid for '+':
3679         #       => $NO            means that the new value is not to replace
3680         #                         any existing ones, but any empty gaps of the
3681         #                         range list coinciding with the input range
3682         #                         will be filled in with the new value.
3683         #       => $UNCONDITIONALLY  means to replace the existing values with
3684         #                         this one unconditionally.  However, if the
3685         #                         new and old values are identical, the
3686         #                         replacement is skipped to save cycles
3687         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3688         #          (the default)  with this one if they are not equivalent.
3689         #                         Ranges are equivalent if their types are the
3690         #                         same, and they are the same string; or if
3691         #                         both are type 0 ranges, if their Unicode
3692         #                         standard forms are identical.  In this last
3693         #                         case, the routine chooses the more "modern"
3694         #                         one to use.  This is because some of the
3695         #                         older files are formatted with values that
3696         #                         are, for example, ALL CAPs, whereas the
3697         #                         derived files have a more modern style,
3698         #                         which looks better.  By looking for this
3699         #                         style when the pre-existing and replacement
3700         #                         standard forms are the same, we can move to
3701         #                         the modern style
3702         #       => $MULTIPLE_BEFORE means that if this range duplicates an
3703         #                         existing one, but has a different value,
3704         #                         don't replace the existing one, but insert
3705         #                         this, one so that the same range can occur
3706         #                         multiple times.  They are stored LIFO, so
3707         #                         that the final one inserted is the first one
3708         #                         returned in an ordered search of the table.
3709         #                         If this is an exact duplicate, including the
3710         #                         value, the original will be moved to be
3711         #                         first, before any other duplicate ranges
3712         #                         with different values.
3713         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3714         #                         FIFO, so that this one is inserted after all
3715         #                         others that currently exist.  If this is an
3716         #                         exact duplicate, including value, of an
3717         #                         existing range, this one is discarded
3718         #                         (leaving the existing one in its original,
3719         #                         higher priority position
3720         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3721         #
3722         # "same value" means identical for non-type-0 ranges, and it means
3723         # having the same standard forms for type-0 ranges.
3724
3725         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3726
3727         my $self = shift;
3728         my $operation = shift;   # '+' for add/replace; '-' for delete;
3729         my $start = shift;
3730         my $end   = shift;
3731         my $value = shift;
3732
3733         my %args = @_;
3734
3735         $value = "" if not defined $value;        # warning: $value can be "0"
3736
3737         my $replace = delete $args{'Replace'};
3738         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3739
3740         my $type = delete $args{'Type'};
3741         $type = 0 unless defined $type;
3742
3743         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3744
3745         my $addr = do { no overloading; pack 'J', $self; };
3746
3747         if ($operation ne '+' && $operation ne '-') {
3748             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3749             return;
3750         }
3751         unless (defined $start && defined $end) {
3752             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3753             return;
3754         }
3755         unless ($end >= $start) {
3756             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3757             return;
3758         }
3759         #local $to_trace = 1 if main::DEBUG;
3760
3761         if ($operation eq '-') {
3762             if ($replace != $IF_NOT_EQUIVALENT) {
3763                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3764                 $replace = $IF_NOT_EQUIVALENT;
3765             }
3766             if ($type) {
3767                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3768                 $type = 0;
3769             }
3770             if ($value ne "") {
3771                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3772                 $value = "";
3773             }
3774         }
3775
3776         my $r = $ranges{$addr};               # The current list of ranges
3777         my $range_list_size = scalar @$r;     # And its size
3778         my $max = $max{$addr};                # The current high code point in
3779                                               # the list of ranges
3780
3781         # Do a special case requiring fewer machine cycles when the new range
3782         # starts after the current highest point.  The Unicode input data is
3783         # structured so this is common.
3784         if ($start > $max) {
3785
3786             trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3787             return if $operation eq '-'; # Deleting a non-existing range is a
3788                                          # no-op
3789
3790             # If the new range doesn't logically extend the current final one
3791             # in the range list, create a new range at the end of the range
3792             # list.  (max cleverly is initialized to a negative number not
3793             # adjacent to 0 if the range list is empty, so even adding a range
3794             # to an empty range list starting at 0 will have this 'if'
3795             # succeed.)
3796             if ($start > $max + 1        # non-adjacent means can't extend.
3797                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3798                 || @{$r}[-1]->type != $type # types differ, can't extend.
3799             ) {
3800                 push @$r, Range->new($start, $end,
3801                                      Value => $value,
3802                                      Type => $type);
3803             }
3804             else {
3805
3806                 # Here, the new range starts just after the current highest in
3807                 # the range list, and they have the same type and value.
3808                 # Extend the current range to incorporate the new one.
3809                 @{$r}[-1]->set_end($end);
3810             }
3811
3812             # This becomes the new maximum.
3813             $max{$addr} = $end;
3814
3815             return;
3816         }
3817         #local $to_trace = 0 if main::DEBUG;
3818
3819         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3820
3821         # Here, the input range isn't after the whole rest of the range list.
3822         # Most likely 'splice' will be needed.  The rest of the routine finds
3823         # the needed splice parameters, and if necessary, does the splice.
3824         # First, find the offset parameter needed by the splice function for
3825         # the input range.  Note that the input range may span multiple
3826         # existing ones, but we'll worry about that later.  For now, just find
3827         # the beginning.  If the input range is to be inserted starting in a
3828         # position not currently in the range list, it must (obviously) come
3829         # just after the range below it, and just before the range above it.
3830         # Slightly less obviously, it will occupy the position currently
3831         # occupied by the range that is to come after it.  More formally, we
3832         # are looking for the position, $i, in the array of ranges, such that:
3833         #
3834         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3835         #
3836         # (The ordered relationships within existing ranges are also shown in
3837         # the equation above).  However, if the start of the input range is
3838         # within an existing range, the splice offset should point to that
3839         # existing range's position in the list; that is $i satisfies a
3840         # somewhat different equation, namely:
3841         #
3842         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3843         #
3844         # More briefly, $start can come before or after r[$i]->start, and at
3845         # this point, we don't know which it will be.  However, these
3846         # two equations share these constraints:
3847         #
3848         #   r[$i-1]->end < $start <= r[$i]->end
3849         #
3850         # And that is good enough to find $i.
3851
3852         my $i = $self->_search_ranges($start);
3853         if (! defined $i) {
3854             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3855             return;
3856         }
3857
3858         # The search function returns $i such that:
3859         #
3860         # r[$i-1]->end < $start <= r[$i]->end
3861         #
3862         # That means that $i points to the first range in the range list
3863         # that could possibly be affected by this operation.  We still don't
3864         # know if the start of the input range is within r[$i], or if it
3865         # points to empty space between r[$i-1] and r[$i].
3866         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3867
3868         # Special case the insertion of data that is not to replace any
3869         # existing data.
3870         if ($replace == $NO) {  # If $NO, has to be operation '+'
3871             #local $to_trace = 1 if main::DEBUG;
3872             trace "Doesn't replace" if main::DEBUG && $to_trace;
3873
3874             # Here, the new range is to take effect only on those code points
3875             # that aren't already in an existing range.  This can be done by
3876             # looking through the existing range list and finding the gaps in
3877             # the ranges that this new range affects, and then calling this
3878             # function recursively on each of those gaps, leaving untouched
3879             # anything already in the list.  Gather up a list of the changed
3880             # gaps first so that changes to the internal state as new ranges
3881             # are added won't be a problem.
3882             my @gap_list;
3883
3884             # First, if the starting point of the input range is outside an
3885             # existing one, there is a gap from there to the beginning of the
3886             # existing range -- add a span to fill the part that this new
3887             # range occupies
3888             if ($start < $r->[$i]->start) {
3889                 push @gap_list, Range->new($start,
3890                                            main::min($end,
3891                                                      $r->[$i]->start - 1),
3892                                            Type => $type);
3893                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3894             }
3895
3896             # Then look through the range list for other gaps until we reach
3897             # the highest range affected by the input one.
3898             my $j;
3899             for ($j = $i+1; $j < $range_list_size; $j++) {
3900                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3901                 last if $end < $r->[$j]->start;
3902
3903                 # If there is a gap between when this range starts and the
3904                 # previous one ends, add a span to fill it.  Note that just
3905                 # because there are two ranges doesn't mean there is a
3906                 # non-zero gap between them.  It could be that they have
3907                 # different values or types
3908                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3909                     push @gap_list,
3910                         Range->new($r->[$j-1]->end + 1,
3911                                    $r->[$j]->start - 1,
3912                                    Type => $type);
3913                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3914                 }
3915             }
3916
3917             # Here, we have either found an existing range in the range list,
3918             # beyond the area affected by the input one, or we fell off the
3919             # end of the loop because the input range affects the whole rest
3920             # of the range list.  In either case, $j is 1 higher than the
3921             # highest affected range.  If $j == $i, it means that there are no
3922             # affected ranges, that the entire insertion is in the gap between
3923             # r[$i-1], and r[$i], which we already have taken care of before
3924             # the loop.
3925             # On the other hand, if there are affected ranges, it might be
3926             # that there is a gap that needs filling after the final such
3927             # range to the end of the input range
3928             if ($r->[$j-1]->end < $end) {
3929                     push @gap_list, Range->new(main::max($start,
3930                                                          $r->[$j-1]->end + 1),
3931                                                $end,
3932                                                Type => $type);
3933                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3934             }
3935
3936             # Call recursively to fill in all the gaps.
3937             foreach my $gap (@gap_list) {
3938                 $self->_add_delete($operation,
3939                                    $gap->start,
3940                                    $gap->end,
3941                                    $value,
3942                                    Type => $type);
3943             }
3944
3945             return;
3946         }
3947
3948         # Here, we have taken care of the case where $replace is $NO.
3949         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3950         # If inserting a multiple record, this is where it goes, before the
3951         # first (if any) existing one if inserting LIFO.  (If this is to go
3952         # afterwards, FIFO, we below move the pointer to there.)  These imply
3953         # an insertion, and no change to any existing ranges.  Note that $i
3954         # can be -1 if this new range doesn't actually duplicate any existing,
3955         # and comes at the beginning of the list.
3956         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3957
3958             if ($start != $end) {
3959                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3960                 return;
3961             }
3962
3963             # If the new code point is within a current range ...
3964             if ($end >= $r->[$i]->start) {
3965
3966                 # Don't add an exact duplicate, as it isn't really a multiple
3967                 my $existing_value = $r->[$i]->value;
3968                 my $existing_type = $r->[$i]->type;
3969                 return if $value eq $existing_value && $type eq $existing_type;
3970
3971                 # If the multiple value is part of an existing range, we want
3972                 # to split up that range, so that only the single code point
3973                 # is affected.  To do this, we first call ourselves
3974                 # recursively to delete that code point from the table, having
3975                 # preserved its current data above.  Then we call ourselves
3976                 # recursively again to add the new multiple, which we know by
3977                 # the test just above is different than the current code
3978                 # point's value, so it will become a range containing a single
3979                 # code point: just itself.  Finally, we add back in the
3980                 # pre-existing code point, which will again be a single code
3981                 # point range.  Because 'i' likely will have changed as a
3982                 # result of these operations, we can't just continue on, but
3983                 # do this operation recursively as well.  If we are inserting
3984                 # LIFO, the pre-existing code point needs to go after the new
3985                 # one, so use MULTIPLE_AFTER; and vice versa.
3986                 if ($r->[$i]->start != $r->[$i]->end) {
3987                     $self->_add_delete('-', $start, $end, "");
3988                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3989                     return $self->_add_delete('+',
3990                             $start, $end,
3991                             $existing_value,
3992                             Type => $existing_type,
3993                             Replace => ($replace == $MULTIPLE_BEFORE)
3994                                        ? $MULTIPLE_AFTER
3995                                        : $MULTIPLE_BEFORE);
3996                 }
3997             }
3998
3999             # If to place this new record after, move to beyond all existing
4000             # ones; but don't add this one if identical to any of them, as it
4001             # isn't really a multiple.  This leaves the original order, so
4002             # that the current request is ignored.  The reasoning is that the
4003             # previous request that wanted this record to have high priority
4004             # should have precedence.
4005             if ($replace == $MULTIPLE_AFTER) {
4006                 while ($i < @$r && $r->[$i]->start == $start) {
4007                     return if $value eq $r->[$i]->value
4008                               && $type eq $r->[$i]->type;
4009                     $i++;
4010                 }
4011             }
4012             else {
4013                 # If instead we are to place this new record before any
4014                 # existing ones, remove any identical ones that come after it.
4015                 # This changes the existing order so that the new one is
4016                 # first, as is being requested.
4017                 for (my $j = $i + 1;
4018                      $j < @$r && $r->[$j]->start == $start;
4019                      $j++)
4020                 {
4021                     if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4022                         splice @$r, $j, 1;
4023                         last;   # There should only be one instance, so no
4024                                 # need to keep looking
4025                     }
4026                 }
4027             }
4028
4029             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4030             my @return = splice @$r,
4031                                 $i,
4032                                 0,
4033                                 Range->new($start,
4034                                            $end,
4035                                            Value => $value,
4036                                            Type => $type);
4037             if (main::DEBUG && $to_trace) {
4038                 trace "After splice:";
4039                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4040                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4041                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4042                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4043                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4044                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4045             }
4046             return @return;
4047         }
4048
4049         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4050         # leaves delete, insert, and replace either unconditionally or if not
4051         # equivalent.  $i still points to the first potential affected range.
4052         # Now find the highest range affected, which will determine the length
4053         # parameter to splice.  (The input range can span multiple existing
4054         # ones.)  If this isn't a deletion, while we are looking through the
4055         # range list, see also if this is a replacement rather than a clean
4056         # insertion; that is if it will change the values of at least one
4057         # existing range.  Start off assuming it is an insert, until find it
4058         # isn't.
4059         my $clean_insert = $operation eq '+';
4060         my $j;        # This will point to the highest affected range
4061
4062         # For non-zero types, the standard form is the value itself;
4063         my $standard_form = ($type) ? $value : main::standardize($value);
4064
4065         for ($j = $i; $j < $range_list_size; $j++) {
4066             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4067
4068             # If find a range that it doesn't overlap into, we can stop
4069             # searching
4070             last if $end < $r->[$j]->start;
4071
4072             # Here, overlaps the range at $j.  If the values don't match,
4073             # and so far we think this is a clean insertion, it becomes a
4074             # non-clean insertion, i.e., a 'change' or 'replace' instead.
4075             if ($clean_insert) {
4076                 if ($r->[$j]->standard_form ne $standard_form) {
4077                     $clean_insert = 0;
4078                     if ($replace == $CROAK) {
4079                         main::croak("The range to add "
4080                         . sprintf("%04X", $start)
4081                         . '-'
4082                         . sprintf("%04X", $end)
4083                         . " with value '$value' overlaps an existing range $r->[$j]");
4084                     }
4085                 }
4086                 else {
4087
4088                     # Here, the two values are essentially the same.  If the
4089                     # two are actually identical, replacing wouldn't change
4090                     # anything so skip it.
4091                     my $pre_existing = $r->[$j]->value;
4092                     if ($pre_existing ne $value) {
4093
4094                         # Here the new and old standardized values are the
4095                         # same, but the non-standardized values aren't.  If
4096                         # replacing unconditionally, then replace
4097                         if( $replace == $UNCONDITIONALLY) {
4098                             $clean_insert = 0;
4099                         }
4100                         else {
4101
4102                             # Here, are replacing conditionally.  Decide to
4103                             # replace or not based on which appears to look
4104                             # the "nicest".  If one is mixed case and the
4105                             # other isn't, choose the mixed case one.
4106                             my $new_mixed = $value =~ /[A-Z]/
4107                                             && $value =~ /[a-z]/;
4108                             my $old_mixed = $pre_existing =~ /[A-Z]/
4109                                             && $pre_existing =~ /[a-z]/;
4110
4111                             if ($old_mixed != $new_mixed) {
4112                                 $clean_insert = 0 if $new_mixed;
4113                                 if (main::DEBUG && $to_trace) {
4114                                     if ($clean_insert) {
4115                                         trace "Retaining $pre_existing over $value";
4116                                     }
4117                                     else {
4118                                         trace "Replacing $pre_existing with $value";
4119                                     }
4120                                 }
4121                             }
4122                             else {
4123
4124                                 # Here casing wasn't different between the two.
4125                                 # If one has hyphens or underscores and the
4126                                 # other doesn't, choose the one with the
4127                                 # punctuation.
4128                                 my $new_punct = $value =~ /[-_]/;
4129                                 my $old_punct = $pre_existing =~ /[-_]/;
4130
4131                                 if ($old_punct != $new_punct) {
4132                                     $clean_insert = 0 if $new_punct;
4133                                     if (main::DEBUG && $to_trace) {
4134                                         if ($clean_insert) {
4135                                             trace "Retaining $pre_existing over $value";
4136                                         }
4137                                         else {
4138                                             trace "Replacing $pre_existing with $value";
4139                                         }
4140                                     }
4141                                 }   # else existing one is just as "good";
4142                                     # retain it to save cycles.
4143                             }
4144                         }
4145                     }
4146                 }
4147             }
4148         } # End of loop looking for highest affected range.
4149
4150         # Here, $j points to one beyond the highest range that this insertion
4151         # affects (hence to beyond the range list if that range is the final
4152         # one in the range list).
4153
4154         # The splice length is all the affected ranges.  Get it before
4155         # subtracting, for efficiency, so we don't have to later add 1.
4156         my $length = $j - $i;
4157
4158         $j--;        # $j now points to the highest affected range.
4159         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4160
4161         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4162         # $j points to the highest affected range.  But it can be < $i or even
4163         # -1.  These happen only if the insertion is entirely in the gap
4164         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4165         # above exited first time through with $end < $r->[$i]->start.  (And
4166         # then we subtracted one from j)  This implies also that $start <
4167         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4168         # $start, so the entire input range is in the gap.
4169         if ($j < $i) {
4170
4171             # Here the entire input range is in the gap before $i.
4172
4173             if (main::DEBUG && $to_trace) {
4174                 if ($i) {
4175                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
4176                 }
4177                 else {
4178                     trace "Entire range is before $r->[$i]";
4179                 }
4180             }
4181             return if $operation ne '+'; # Deletion of a non-existent range is
4182                                          # a no-op
4183         }
4184         else {
4185
4186             # Here part of the input range is not in the gap before $i.  Thus,
4187             # there is at least one affected one, and $j points to the highest
4188             # such one.
4189
4190             # At this point, here is the situation:
4191             # This is not an insertion of a multiple, nor of tentative ($NO)
4192             # data.
4193             #   $i  points to the first element in the current range list that
4194             #            may be affected by this operation.  In fact, we know
4195             #            that the range at $i is affected because we are in
4196             #            the else branch of this 'if'
4197             #   $j  points to the highest affected range.
4198             # In other words,
4199             #   r[$i-1]->end < $start <= r[$i]->end
4200             # And:
4201             #   r[$i-1]->end < $start <= $end <= r[$j]->end
4202             #
4203             # Also:
4204             #   $clean_insert is a boolean which is set true if and only if
4205             #        this is a "clean insertion", i.e., not a change nor a
4206             #        deletion (multiple was handled above).
4207
4208             # We now have enough information to decide if this call is a no-op
4209             # or not.  It is a no-op if this is an insertion of already
4210             # existing data.
4211
4212             if (main::DEBUG && $to_trace && $clean_insert
4213                                          && $i == $j
4214                                          && $start >= $r->[$i]->start)
4215             {
4216                     trace "no-op";
4217             }
4218             return if $clean_insert
4219                       && $i == $j # more than one affected range => not no-op
4220
4221                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4222                       # Further, $start and/or $end is >= r[$i]->start
4223                       # The test below hence guarantees that
4224                       #     r[$i]->start < $start <= $end <= r[$i]->end
4225                       # This means the input range is contained entirely in
4226                       # the one at $i, so is a no-op
4227                       && $start >= $r->[$i]->start;
4228         }
4229
4230         # Here, we know that some action will have to be taken.  We have
4231         # calculated the offset and length (though adjustments may be needed)
4232         # for the splice.  Now start constructing the replacement list.
4233         my @replacement;
4234         my $splice_start = $i;
4235
4236         my $extends_below;
4237         my $extends_above;
4238
4239         # See if should extend any adjacent ranges.
4240         if ($operation eq '-') { # Don't extend deletions
4241             $extends_below = $extends_above = 0;
4242         }
4243         else {  # Here, should extend any adjacent ranges.  See if there are
4244                 # any.
4245             $extends_below = ($i > 0
4246                             # can't extend unless adjacent
4247                             && $r->[$i-1]->end == $start -1
4248                             # can't extend unless are same standard value
4249                             && $r->[$i-1]->standard_form eq $standard_form
4250                             # can't extend unless share type
4251                             && $r->[$i-1]->type == $type);
4252             $extends_above = ($j+1 < $range_list_size
4253                             && $r->[$j+1]->start == $end +1
4254                             && $r->[$j+1]->standard_form eq $standard_form
4255                             && $r->[$j+1]->type == $type);
4256         }
4257         if ($extends_below && $extends_above) { # Adds to both
4258             $splice_start--;     # start replace at element below
4259             $length += 2;        # will replace on both sides
4260             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4261
4262             # The result will fill in any gap, replacing both sides, and
4263             # create one large range.
4264             @replacement = Range->new($r->[$i-1]->start,
4265                                       $r->[$j+1]->end,
4266                                       Value => $value,
4267                                       Type => $type);
4268         }
4269         else {
4270
4271             # Here we know that the result won't just be the conglomeration of
4272             # a new range with both its adjacent neighbors.  But it could
4273             # extend one of them.
4274
4275             if ($extends_below) {
4276
4277                 # Here the new element adds to the one below, but not to the
4278                 # one above.  If inserting, and only to that one range,  can
4279                 # just change its ending to include the new one.
4280                 if ($length == 0 && $clean_insert) {
4281                     $r->[$i-1]->set_end($end);
4282                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4283                     return;
4284                 }
4285                 else {
4286                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4287                     $splice_start--;        # start replace at element below
4288                     $length++;              # will replace the element below
4289                     $start = $r->[$i-1]->start;
4290                 }
4291             }
4292             elsif ($extends_above) {
4293
4294                 # Here the new element adds to the one above, but not below.
4295                 # Mirror the code above
4296                 if ($length == 0 && $clean_insert) {
4297                     $r->[$j+1]->set_start($start);
4298                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4299                     return;
4300                 }
4301                 else {
4302                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4303                     $length++;        # will replace the element above
4304                     $end = $r->[$j+1]->end;
4305                 }
4306             }
4307
4308             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4309
4310             # Finally, here we know there will have to be a splice.
4311             # If the change or delete affects only the highest portion of the
4312             # first affected range, the range will have to be split.  The
4313             # splice will remove the whole range, but will replace it by a new
4314             # range containing just the unaffected part.  So, in this case,
4315             # add to the replacement list just this unaffected portion.
4316             if (! $extends_below
4317                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4318             {
4319                 push @replacement,
4320                     Range->new($r->[$i]->start,
4321                                $start - 1,
4322                                Value => $r->[$i]->value,
4323                                Type => $r->[$i]->type);
4324             }
4325
4326             # In the case of an insert or change, but not a delete, we have to
4327             # put in the new stuff;  this comes next.
4328             if ($operation eq '+') {
4329                 push @replacement, Range->new($start,
4330                                               $end,
4331                                               Value => $value,
4332                                               Type => $type);
4333             }
4334
4335             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4336             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4337
4338             # And finally, if we're changing or deleting only a portion of the
4339             # highest affected range, it must be split, as the lowest one was.
4340             if (! $extends_above
4341                 && $j >= 0  # Remember that j can be -1 if before first
4342                             # current element
4343                 && $end >= $r->[$j]->start
4344                 && $end < $r->[$j]->end)
4345             {
4346                 push @replacement,
4347                     Range->new($end + 1,
4348                                $r->[$j]->end,
4349                                Value => $r->[$j]->value,
4350                                Type => $r->[$j]->type);
4351             }
4352         }
4353
4354         # And do the splice, as calculated above
4355         if (main::DEBUG && $to_trace) {
4356             trace "replacing $length element(s) at $i with ";
4357             foreach my $replacement (@replacement) {
4358                 trace "    $replacement";
4359             }
4360             trace "Before splice:";
4361             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4362             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4363             trace "i  =[", $i, "]", $r->[$i];
4364             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4365             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4366         }
4367
4368         my @return = splice @$r, $splice_start, $length, @replacement;
4369
4370         if (main::DEBUG && $to_trace) {
4371             trace "After splice:";
4372             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4373             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4374             trace "i  =[", $i, "]", $r->[$i];
4375             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4376             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4377             trace "removed ", @return if @return;
4378         }
4379
4380         # An actual deletion could have changed the maximum in the list.
4381         # There was no deletion if the splice didn't return something, but
4382         # otherwise recalculate it.  This is done too rarely to worry about
4383         # performance.
4384         if ($operation eq '-' && @return) {
4385             if (@$r) {
4386                 $max{$addr} = $r->[-1]->end;
4387             }
4388             else {  # Now empty
4389                 $max{$addr} = $max_init;
4390             }
4391         }
4392         return @return;
4393     }
4394
4395     sub reset_each_range {  # reset the iterator for each_range();
4396         my $self = shift;
4397         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4398
4399         no overloading;
4400         undef $each_range_iterator{pack 'J', $self};
4401         return;
4402     }
4403
4404     sub each_range {
4405         # Iterate over each range in a range list.  Results are undefined if
4406         # the range list is changed during the iteration.
4407
4408         my $self = shift;
4409         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4410
4411         my $addr = do { no overloading; pack 'J', $self; };
4412
4413         return if $self->is_empty;
4414
4415         $each_range_iterator{$addr} = -1
4416                                 if ! defined $each_range_iterator{$addr};
4417         $each_range_iterator{$addr}++;
4418         return $ranges{$addr}->[$each_range_iterator{$addr}]
4419                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4420         undef $each_range_iterator{$addr};
4421         return;
4422     }
4423
4424     sub count {        # Returns count of code points in range list
4425         my $self = shift;
4426         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4427
4428         my $addr = do { no overloading; pack 'J', $self; };
4429
4430         my $count = 0;
4431         foreach my $range (@{$ranges{$addr}}) {
4432             $count += $range->end - $range->start + 1;
4433         }
4434         return $count;
4435     }
4436
4437     sub delete_range {    # Delete a range
4438         my $self = shift;
4439         my $start = shift;
4440         my $end = shift;
4441
4442         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4443
4444         return $self->_add_delete('-', $start, $end, "");
4445     }
4446
4447     sub is_empty { # Returns boolean as to if a range list is empty
4448         my $self = shift;
4449         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4450
4451         no overloading;
4452         return scalar @{$ranges{pack 'J', $self}} == 0;
4453     }
4454
4455     sub hash {
4456         # Quickly returns a scalar suitable for separating tables into
4457         # buckets, i.e. it is a hash function of the contents of a table, so
4458         # there are relatively few conflicts.
4459
4460         my $self = shift;
4461         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4462
4463         my $addr = do { no overloading; pack 'J', $self; };
4464
4465         # These are quickly computable.  Return looks like 'min..max;count'
4466         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4467     }
4468 } # End closure for _Range_List_Base
4469
4470 package Range_List;
4471 use parent '-norequire', '_Range_List_Base';
4472
4473 # A Range_List is a range list for match tables; i.e. the range values are
4474 # not significant.  Thus a number of operations can be safely added to it,
4475 # such as inversion, intersection.  Note that union is also an unsafe
4476 # operation when range values are cared about, and that method is in the base
4477 # class, not here.  But things are set up so that that method is callable only
4478 # during initialization.  Only in this derived class, is there an operation
4479 # that combines two tables.  A Range_Map can thus be used to initialize a
4480 # Range_List, and its mappings will be in the list, but are not significant to
4481 # this class.
4482
4483 sub trace { return main::trace(@_); }
4484
4485 { # Closure
4486
4487     use overload
4488         fallback => 0,
4489         '+' => sub { my $self = shift;
4490                     my $other = shift;
4491
4492                     return $self->_union($other)
4493                 },
4494         '+=' => sub { my $self = shift;
4495                     my $other = shift;
4496                     my $reversed = shift;
4497
4498                     if ($reversed) {
4499                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4500                         . ref($other)
4501                         . ' += '
4502                         . ref($self)
4503                         . "'.  undef returned.");
4504                         return;
4505                     }
4506
4507                     return $self->_union($other)
4508                 },
4509         '&' => sub { my $self = shift;
4510                     my $other = shift;
4511
4512                     return $self->_intersect($other, 0);
4513                 },
4514         '&=' => sub { my $self = shift;
4515                     my $other = shift;
4516                     my $reversed = shift;
4517
4518                     if ($reversed) {
4519                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4520                         . ref($other)
4521                         . ' &= '
4522                         . ref($self)
4523                         . "'.  undef returned.");
4524                         return;
4525                     }
4526
4527                     return $self->_intersect($other, 0);
4528                 },
4529         '~' => "_invert",
4530         '-' => "_subtract",
4531     ;
4532
4533     sub _invert {
4534         # Returns a new Range_List that gives all code points not in $self.
4535
4536         my $self = shift;
4537
4538         my $new = Range_List->new;
4539
4540         # Go through each range in the table, finding the gaps between them
4541         my $max = -1;   # Set so no gap before range beginning at 0
4542         for my $range ($self->ranges) {
4543             my $start = $range->start;
4544             my $end   = $range->end;
4545
4546             # If there is a gap before this range, the inverse will contain
4547             # that gap.
4548             if ($start > $max + 1) {
4549                 $new->add_range($max + 1, $start - 1);
4550             }
4551             $max = $end;
4552         }
4553
4554         # And finally, add the gap from the end of the table to the max
4555         # possible code point
4556         if ($max < $MAX_WORKING_CODEPOINT) {
4557             $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4558         }
4559         return $new;
4560     }
4561
4562     sub _subtract {
4563         # Returns a new Range_List with the argument deleted from it.  The
4564         # argument can be a single code point, a range, or something that has
4565         # a range, with the _range_list() method on it returning them
4566
4567         my $self = shift;
4568         my $other = shift;
4569         my $reversed = shift;
4570         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4571
4572         if ($reversed) {
4573             Carp::my_carp_bug("Bad news.  Can't cope with '"
4574             . ref($other)
4575             . ' - '
4576             . ref($self)
4577             . "'.  undef returned.");
4578             return;
4579         }
4580
4581         my $new = Range_List->new(Initialize => $self);
4582
4583         if (! ref $other) { # Single code point
4584             $new->delete_range($other, $other);
4585         }
4586         elsif ($other->isa('Range')) {
4587             $new->delete_range($other->start, $other->end);
4588         }
4589         elsif ($other->can('_range_list')) {
4590             foreach my $range ($other->_range_list->ranges) {
4591                 $new->delete_range($range->start, $range->end);
4592             }
4593         }
4594         else {
4595             Carp::my_carp_bug("Can't cope with a "
4596                         . ref($other)
4597                         . " argument to '-'.  Subtraction ignored."
4598                         );
4599             return $self;
4600         }
4601
4602         return $new;
4603     }
4604
4605     sub _intersect {
4606         # Returns either a boolean giving whether the two inputs' range lists
4607         # intersect (overlap), or a new Range_List containing the intersection
4608         # of the two lists.  The optional final parameter being true indicates
4609         # to do the check instead of the intersection.
4610
4611         my $a_object = shift;
4612         my $b_object = shift;
4613         my $check_if_overlapping = shift;
4614         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4615         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4616
4617         if (! defined $b_object) {
4618             my $message = "";
4619             $message .= $a_object->_owner_name_of if defined $a_object;
4620             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4621             return;
4622         }
4623
4624         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4625         # Thus the intersection could be much more simply be written:
4626         #   return ~(~$a_object + ~$b_object);
4627         # But, this is slower, and when taking the inverse of a large
4628         # range_size_1 table, back when such tables were always stored that
4629         # way, it became prohibitively slow, hence the code was changed to the
4630         # below
4631
4632         if ($b_object->isa('Range')) {
4633             $b_object = Range_List->new(Initialize => $b_object,
4634                                         Owner => $a_object->_owner_name_of);
4635         }
4636         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4637
4638         my @a_ranges = $a_object->ranges;
4639         my @b_ranges = $b_object->ranges;
4640
4641         #local $to_trace = 1 if main::DEBUG;
4642         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4643
4644         # Start with the first range in each list
4645         my $a_i = 0;
4646         my $range_a = $a_ranges[$a_i];
4647         my $b_i = 0;
4648         my $range_b = $b_ranges[$b_i];
4649
4650         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4651                                                 if ! $check_if_overlapping;
4652
4653         # If either list is empty, there is no intersection and no overlap
4654         if (! defined $range_a || ! defined $range_b) {
4655             return $check_if_overlapping ? 0 : $new;
4656         }
4657         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4658
4659         # Otherwise, must calculate the intersection/overlap.  Start with the
4660         # very first code point in each list
4661         my $a = $range_a->start;
4662         my $b = $range_b->start;
4663
4664         # Loop through all the ranges of each list; in each iteration, $a and
4665         # $b are the current code points in their respective lists
4666         while (1) {
4667
4668             # If $a and $b are the same code point, ...
4669             if ($a == $b) {
4670
4671                 # it means the lists overlap.  If just checking for overlap
4672                 # know the answer now,
4673                 return 1 if $check_if_overlapping;
4674
4675                 # The intersection includes this code point plus anything else
4676                 # common to both current ranges.
4677                 my $start = $a;
4678                 my $end = main::min($range_a->end, $range_b->end);
4679                 if (! $check_if_overlapping) {
4680                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4681                     $new->add_range($start, $end);
4682                 }
4683
4684                 # Skip ahead to the end of the current intersect
4685                 $a = $b = $end;
4686
4687                 # If the current intersect ends at the end of either range (as
4688                 # it must for at least one of them), the next possible one
4689                 # will be the beginning code point in it's list's next range.
4690                 if ($a == $range_a->end) {
4691                     $range_a = $a_ranges[++$a_i];
4692                     last unless defined $range_a;
4693                     $a = $range_a->start;
4694                 }
4695                 if ($b == $range_b->end) {
4696                     $range_b = $b_ranges[++$b_i];
4697                     last unless defined $range_b;
4698                     $b = $range_b->start;
4699                 }
4700
4701                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4702             }
4703             elsif ($a < $b) {
4704
4705                 # Not equal, but if the range containing $a encompasses $b,
4706                 # change $a to be the middle of the range where it does equal
4707                 # $b, so the next iteration will get the intersection
4708                 if ($range_a->end >= $b) {
4709                     $a = $b;
4710                 }
4711                 else {
4712
4713                     # Here, the current range containing $a is entirely below
4714                     # $b.  Go try to find a range that could contain $b.
4715                     $a_i = $a_object->_search_ranges($b);
4716
4717                     # If no range found, quit.
4718                     last unless defined $a_i;
4719
4720                     # The search returns $a_i, such that
4721                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4722                     # Set $a to the beginning of this new range, and repeat.
4723                     $range_a = $a_ranges[$a_i];
4724                     $a = $range_a->start;
4725                 }
4726             }
4727             else { # Here, $b < $a.
4728
4729                 # Mirror image code to the leg just above
4730                 if ($range_b->end >= $a) {
4731                     $b = $a;
4732                 }
4733                 else {
4734                     $b_i = $b_object->_search_ranges($a);
4735                     last unless defined $b_i;
4736                     $range_b = $b_ranges[$b_i];
4737                     $b = $range_b->start;
4738                 }
4739             }
4740         } # End of looping through ranges.
4741
4742         # Intersection fully computed, or now know that there is no overlap
4743         return $check_if_overlapping ? 0 : $new;
4744     }
4745
4746     sub overlaps {
4747         # Returns boolean giving whether the two arguments overlap somewhere
4748
4749         my $self = shift;
4750         my $other = shift;
4751         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4752
4753         return $self->_intersect($other, 1);
4754     }
4755
4756     sub add_range {
4757         # Add a range to the list.
4758
4759         my $self = shift;
4760         my $start = shift;
4761         my $end = shift;
4762         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4763
4764         return $self->_add_delete('+', $start, $end, "");
4765     }
4766
4767     sub matches_identically_to {
4768         # Return a boolean as to whether or not two Range_Lists match identical
4769         # sets of code points.
4770
4771         my $self = shift;
4772         my $other = shift;
4773         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4774
4775         # These are ordered in increasing real time to figure out (at least
4776         # until a patch changes that and doesn't change this)
4777         return 0 if $self->max != $other->max;
4778         return 0 if $self->min != $other->min;
4779         return 0 if $self->range_count != $other->range_count;
4780         return 0 if $self->count != $other->count;
4781
4782         # Here they could be identical because all the tests above passed.
4783         # The loop below is somewhat simpler since we know they have the same
4784         # number of elements.  Compare range by range, until reach the end or
4785         # find something that differs.
4786         my @a_ranges = $self->ranges;
4787         my @b_ranges = $other->ranges;
4788         for my $i (0 .. @a_ranges - 1) {
4789             my $a = $a_ranges[$i];
4790             my $b = $b_ranges[$i];
4791             trace "self $a; other $b" if main::DEBUG && $to_trace;
4792             return 0 if ! defined $b
4793                         || $a->start != $b->start
4794                         || $a->end != $b->end;
4795         }
4796         return 1;
4797     }
4798
4799     sub is_code_point_usable {
4800         # This used only for making the test script.  See if the input
4801         # proposed trial code point is one that Perl will handle.  If second
4802         # parameter is 0, it won't select some code points for various
4803         # reasons, noted below.
4804
4805         my $code = shift;
4806         my $try_hard = shift;
4807         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4808
4809         return 0 if $code < 0;                # Never use a negative
4810
4811         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4812         # the character very frequently used.
4813         return $try_hard if $code == 0x0000;
4814
4815         # shun non-character code points.
4816         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4817         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4818
4819         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
4820         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4821
4822         return 1;
4823     }
4824
4825     sub get_valid_code_point {
4826         # Return a code point that's part of the range list.  Returns nothing
4827         # if the table is empty or we can't find a suitable code point.  This
4828         # used only for making the test script.
4829
4830         my $self = shift;
4831         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4832
4833         my $addr = do { no overloading; pack 'J', $self; };
4834
4835         # On first pass, don't choose less desirable code points; if no good
4836         # one is found, repeat, allowing a less desirable one to be selected.
4837         for my $try_hard (0, 1) {
4838
4839             # Look through all the ranges for a usable code point.
4840             for my $set (reverse $self->ranges) {
4841
4842                 # Try the edge cases first, starting with the end point of the
4843                 # range.
4844                 my $end = $set->end;
4845                 return $end if is_code_point_usable($end, $try_hard);
4846                 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
4847
4848                 # End point didn't, work.  Start at the beginning and try
4849                 # every one until find one that does work.
4850                 for my $trial ($set->start .. $end - 1) {
4851                     return $trial if is_code_point_usable($trial, $try_hard);
4852                 }
4853             }
4854         }
4855         return ();  # If none found, give up.
4856     }
4857
4858     sub get_invalid_code_point {
4859         # Return a code point that's not part of the table.  Returns nothing
4860         # if the table covers all code points or a suitable code point can't
4861         # be found.  This used only for making the test script.
4862
4863         my $self = shift;
4864         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4865
4866         # Just find a valid code point of the inverse, if any.
4867         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4868     }
4869 } # end closure for Range_List
4870
4871 package Range_Map;
4872 use parent '-norequire', '_Range_List_Base';
4873
4874 # A Range_Map is a range list in which the range values (called maps) are
4875 # significant, and hence shouldn't be manipulated by our other code, which
4876 # could be ambiguous or lose things.  For example, in taking the union of two
4877 # lists, which share code points, but which have differing values, which one
4878 # has precedence in the union?
4879 # It turns out that these operations aren't really necessary for map tables,
4880 # and so this class was created to make sure they aren't accidentally
4881 # applied to them.
4882
4883 { # Closure
4884
4885     sub add_map {
4886         # Add a range containing a mapping value to the list
4887
4888         my $self = shift;
4889         # Rest of parameters passed on
4890
4891         return $self->_add_delete('+', @_);
4892     }
4893
4894     sub add_duplicate {
4895         # Adds entry to a range list which can duplicate an existing entry
4896
4897         my $self = shift;
4898         my $code_point = shift;
4899         my $value = shift;
4900         my %args = @_;
4901         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4902         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4903
4904         return $self->add_map($code_point, $code_point,
4905                                 $value, Replace => $replace);
4906     }
4907 } # End of closure for package Range_Map
4908
4909 package _Base_Table;
4910
4911 # A table is the basic data structure that gets written out into a file for
4912 # use by the Perl core.  This is the abstract base class implementing the
4913 # common elements from the derived ones.  A list of the methods to be
4914 # furnished by an implementing class is just after the constructor.
4915
4916 sub standardize { return main::standardize($_[0]); }
4917 sub trace { return main::trace(@_); }
4918
4919 { # Closure
4920
4921     main::setup_package();
4922
4923     my %range_list;
4924     # Object containing the ranges of the table.
4925     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4926
4927     my %full_name;
4928     # The full table name.
4929     main::set_access('full_name', \%full_name, 'r');
4930
4931     my %name;
4932     # The table name, almost always shorter
4933     main::set_access('name', \%name, 'r');
4934
4935     my %short_name;
4936     # The shortest of all the aliases for this table, with underscores removed
4937     main::set_access('short_name', \%short_name);
4938
4939     my %nominal_short_name_length;
4940     # The length of short_name before removing underscores
4941     main::set_access('nominal_short_name_length',
4942                     \%nominal_short_name_length);
4943
4944     my %complete_name;
4945     # The complete name, including property.
4946     main::set_access('complete_name', \%complete_name, 'r');
4947
4948     my %property;
4949     # Parent property this table is attached to.
4950     main::set_access('property', \%property, 'r');
4951
4952     my %aliases;
4953     # Ordered list of alias objects of the table's name.  The first ones in
4954     # the list are output first in comments
4955     main::set_access('aliases', \%aliases, 'readable_array');
4956
4957     my %comment;
4958     # A comment associated with the table for human readers of the files
4959     main::set_access('comment', \%comment, 's');
4960
4961     my %description;
4962     # A comment giving a short description of the table's meaning for human
4963     # readers of the files.
4964     main::set_access('description', \%description, 'readable_array');
4965
4966     my %note;
4967     # A comment giving a short note about the table for human readers of the
4968     # files.
4969     main::set_access('note', \%note, 'readable_array');
4970
4971     my %fate;
4972     # Enum; there are a number of possibilities for what happens to this
4973     # table: it could be normal, or suppressed, or not for external use.  See
4974     # values at definition for $SUPPRESSED.
4975     main::set_access('fate', \%fate, 'r');
4976
4977     my %find_table_from_alias;
4978     # The parent property passes this pointer to a hash which this class adds
4979     # all its aliases to, so that the parent can quickly take an alias and
4980     # find this table.
4981     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4982
4983     my %locked;
4984     # After this table is made equivalent to another one; we shouldn't go
4985     # changing the contents because that could mean it's no longer equivalent
4986     main::set_access('locked', \%locked, 'r');
4987
4988     my %file_path;
4989     # This gives the final path to the file containing the table.  Each
4990     # directory in the path is an element in the array
4991     main::set_access('file_path', \%file_path, 'readable_array');
4992
4993     my %status;
4994     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4995     main::set_access('status', \%status, 'r');
4996
4997     my %status_info;
4998     # A comment about its being obsolete, or whatever non normal status it has
4999     main::set_access('status_info', \%status_info, 'r');
5000
5001     my %caseless_equivalent;
5002     # The table this is equivalent to under /i matching, if any.
5003     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5004
5005     my %range_size_1;
5006     # Is the table to be output with each range only a single code point?
5007     # This is done to avoid breaking existing code that may have come to rely
5008     # on this behavior in previous versions of this program.)
5009     main::set_access('range_size_1', \%range_size_1, 'r', 's');
5010
5011     my %perl_extension;
5012     # A boolean set iff this table is a Perl extension to the Unicode
5013     # standard.
5014     main::set_access('perl_extension', \%perl_extension, 'r');
5015
5016     my %output_range_counts;
5017     # A boolean set iff this table is to have comments written in the
5018     # output file that contain the number of code points in the range.
5019     # The constructor can override the global flag of the same name.
5020     main::set_access('output_range_counts', \%output_range_counts, 'r');
5021
5022     my %write_as_invlist;
5023     # A boolean set iff the output file for this table is to be in the form of
5024     # an inversion list/map.
5025     main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5026
5027     my %format;
5028     # The format of the entries of the table.  This is calculated from the
5029     # data in the table (or passed in the constructor).  This is an enum e.g.,
5030     # $STRING_FORMAT.  It is marked protected as it should not be generally
5031     # used to override calculations.
5032     main::set_access('format', \%format, 'r', 'p_s');
5033
5034     sub new {
5035         # All arguments are key => value pairs, which you can see below, most
5036         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5037         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5038         # documented in the Alias package
5039
5040         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5041
5042         my $class = shift;
5043
5044         my $self = bless \do { my $anonymous_scalar }, $class;
5045         my $addr = do { no overloading; pack 'J', $self; };
5046
5047         my %args = @_;
5048
5049         $name{$addr} = delete $args{'Name'};
5050         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5051         $full_name{$addr} = delete $args{'Full_Name'};
5052         my $complete_name = $complete_name{$addr}
5053                           = delete $args{'Complete_Name'};
5054         $format{$addr} = delete $args{'Format'};
5055         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5056         $property{$addr} = delete $args{'_Property'};
5057         $range_list{$addr} = delete $args{'_Range_List'};
5058         $status{$addr} = delete $args{'Status'} || $NORMAL;
5059         $status_info{$addr} = delete $args{'_Status_Info'} || "";
5060         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5061         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5062         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5063         $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5064         my $ucd = delete $args{'UCD'};
5065
5066         my $description = delete $args{'Description'};
5067         my $ok_as_filename = delete $args{'OK_as_Filename'};
5068         my $loose_match = delete $args{'Fuzzy'};
5069         my $note = delete $args{'Note'};
5070         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5071         my $perl_extension = delete $args{'Perl_Extension'};
5072
5073         # Shouldn't have any left over
5074         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5075
5076         # Can't use || above because conceivably the name could be 0, and
5077         # can't use // operator in case this program gets used in Perl 5.8
5078         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5079         $output_range_counts{$addr} = $output_range_counts if
5080                                         ! defined $output_range_counts{$addr};
5081
5082         $aliases{$addr} = [ ];
5083         $comment{$addr} = [ ];
5084         $description{$addr} = [ ];
5085         $note{$addr} = [ ];
5086         $file_path{$addr} = [ ];
5087         $locked{$addr} = "";
5088
5089         push @{$description{$addr}}, $description if $description;
5090         push @{$note{$addr}}, $note if $note;
5091
5092         if ($fate{$addr} == $PLACEHOLDER) {
5093
5094             # A placeholder table doesn't get documented, is a perl extension,
5095             # and quite likely will be empty
5096             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5097             $perl_extension = 1 if ! defined $perl_extension;
5098             $ucd = 0 if ! defined $ucd;
5099             push @tables_that_may_be_empty, $complete_name{$addr};
5100             $self->add_comment(<<END);
5101 This is a placeholder because it is not in Version $string_version of Unicode,
5102 but is needed by the Perl core to work gracefully.  Because it is not in this
5103 version of Unicode, it will not be listed in $pod_file.pod
5104 END
5105         }
5106         elsif (exists $why_suppressed{$complete_name}
5107                 # Don't suppress if overridden
5108                 && ! grep { $_ eq $complete_name{$addr} }
5109                                                     @output_mapped_properties)
5110         {
5111             $fate{$addr} = $SUPPRESSED;
5112         }
5113         elsif ($fate{$addr} == $SUPPRESSED
5114                && ! exists $why_suppressed{$property{$addr}->complete_name})
5115         {
5116             Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
5117             # perhaps Fate => [ $SUPPRESSED, "reason" ]
5118         }
5119
5120         # If hasn't set its status already, see if it is on one of the
5121         # lists of properties or tables that have particular statuses; if
5122         # not, is normal.  The lists are prioritized so the most serious
5123         # ones are checked first
5124         if (! $status{$addr}) {
5125             if (exists $why_deprecated{$complete_name}) {
5126                 $status{$addr} = $DEPRECATED;
5127             }
5128             elsif (exists $why_stabilized{$complete_name}) {
5129                 $status{$addr} = $STABILIZED;
5130             }
5131             elsif (exists $why_obsolete{$complete_name}) {
5132                 $status{$addr} = $OBSOLETE;
5133             }
5134
5135             # Existence above doesn't necessarily mean there is a message
5136             # associated with it.  Use the most serious message.
5137             if ($status{$addr}) {
5138                 if ($why_deprecated{$complete_name}) {
5139                     $status_info{$addr}
5140                                 = $why_deprecated{$complete_name};
5141                 }
5142                 elsif ($why_stabilized{$complete_name}) {
5143                     $status_info{$addr}
5144                                 = $why_stabilized{$complete_name};
5145                 }
5146                 elsif ($why_obsolete{$complete_name}) {
5147                     $status_info{$addr}
5148                                 = $why_obsolete{$complete_name};
5149                 }
5150             }
5151         }
5152
5153         $perl_extension{$addr} = $perl_extension || 0;
5154
5155         # Don't list a property by default that is internal only
5156         if ($fate{$addr} > $MAP_PROXIED) {
5157             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5158             $ucd = 0 if ! defined $ucd;
5159         }
5160         else {
5161             $ucd = 1 if ! defined $ucd;
5162         }
5163
5164         # By convention what typically gets printed only or first is what's
5165         # first in the list, so put the full name there for good output
5166         # clarity.  Other routines rely on the full name being first on the
5167         # list
5168         $self->add_alias($full_name{$addr},
5169                             OK_as_Filename => $ok_as_filename,
5170                             Fuzzy => $loose_match,
5171                             Re_Pod_Entry => $make_re_pod_entry,
5172                             Status => $status{$addr},
5173                             UCD => $ucd,
5174                             );
5175
5176         # Then comes the other name, if meaningfully different.
5177         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5178             $self->add_alias($name{$addr},
5179                             OK_as_Filename => $ok_as_filename,
5180                             Fuzzy => $loose_match,
5181                             Re_Pod_Entry => $make_re_pod_entry,
5182                             Status => $status{$addr},
5183                             UCD => $ucd,
5184                             );
5185         }
5186
5187         return $self;
5188     }
5189
5190     # Here are the methods that are required to be defined by any derived
5191     # class
5192     for my $sub (qw(
5193                     handle_special_range
5194                     append_to_body
5195                     pre_body
5196                 ))
5197                 # write() knows how to write out normal ranges, but it calls
5198                 # handle_special_range() when it encounters a non-normal one.
5199                 # append_to_body() is called by it after it has handled all
5200                 # ranges to add anything after the main portion of the table.
5201                 # And finally, pre_body() is called after all this to build up
5202                 # anything that should appear before the main portion of the
5203                 # table.  Doing it this way allows things in the middle to
5204                 # affect what should appear before the main portion of the
5205                 # table.
5206     {
5207         no strict "refs";
5208         *$sub = sub {
5209             Carp::my_carp_bug( __LINE__
5210                               . ": Must create method '$sub()' for "
5211                               . ref shift);
5212             return;
5213         }
5214     }
5215
5216     use overload
5217         fallback => 0,
5218         "." => \&main::_operator_dot,
5219         ".=" => \&main::_operator_dot_equal,
5220         '!=' => \&main::_operator_not_equal,
5221         '==' => \&main::_operator_equal,
5222     ;
5223
5224     sub ranges {
5225         # Returns the array of ranges associated with this table.
5226
5227         no overloading;
5228         return $range_list{pack 'J', shift}->ranges;
5229     }
5230
5231     sub add_alias {
5232         # Add a synonym for this table.
5233
5234         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5235
5236         my $self = shift;
5237         my $name = shift;       # The name to add.
5238         my $pointer = shift;    # What the alias hash should point to.  For
5239                                 # map tables, this is the parent property;
5240                                 # for match tables, it is the table itself.
5241
5242         my %args = @_;
5243         my $loose_match = delete $args{'Fuzzy'};
5244
5245         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5246         $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5247
5248         my $ok_as_filename = delete $args{'OK_as_Filename'};
5249         $ok_as_filename = 1 unless defined $ok_as_filename;
5250
5251         my $status = delete $args{'Status'};
5252         $status = $NORMAL unless defined $status;
5253
5254         # An internal name does not get documented, unless overridden by the
5255         # input.
5256         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5257
5258         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5259
5260         # Capitalize the first letter of the alias unless it is one of the CJK
5261         # ones which specifically begins with a lower 'k'.  Do this because
5262         # Unicode has varied whether they capitalize first letters or not, and
5263         # have later changed their minds and capitalized them, but not the
5264         # other way around.  So do it always and avoid changes from release to
5265         # release
5266         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5267
5268         my $addr = do { no overloading; pack 'J', $self; };
5269
5270         # Figure out if should be loosely matched if not already specified.
5271         if (! defined $loose_match) {
5272
5273             # Is a loose_match if isn't null, and doesn't begin with an
5274             # underscore and isn't just a number
5275             if ($name ne ""
5276                 && substr($name, 0, 1) ne '_'
5277                 && $name !~ qr{^[0-9_.+-/]+$})
5278             {
5279                 $loose_match = 1;
5280             }
5281             else {
5282                 $loose_match = 0;
5283             }
5284         }
5285
5286         # If this alias has already been defined, do nothing.
5287         return if defined $find_table_from_alias{$addr}->{$name};
5288
5289         # That includes if it is standardly equivalent to an existing alias,
5290         # in which case, add this name to the list, so won't have to search
5291         # for it again.
5292         my $standard_name = main::standardize($name);
5293         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5294             $find_table_from_alias{$addr}->{$name}
5295                         = $find_table_from_alias{$addr}->{$standard_name};
5296             return;
5297         }
5298
5299         # Set the index hash for this alias for future quick reference.
5300         $find_table_from_alias{$addr}->{$name} = $pointer;
5301         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5302         local $to_trace = 0 if main::DEBUG;
5303         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5304         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5305
5306
5307         # Put the new alias at the end of the list of aliases unless the final
5308         # element begins with an underscore (meaning it is for internal perl
5309         # use) or is all numeric, in which case, put the new one before that
5310         # one.  This floats any all-numeric or underscore-beginning aliases to
5311         # the end.  This is done so that they are listed last in output lists,
5312         # to encourage the user to use a better name (either more descriptive
5313         # or not an internal-only one) instead.  This ordering is relied on
5314         # implicitly elsewhere in this program, like in short_name()
5315         my $list = $aliases{$addr};
5316         my $insert_position = (@$list == 0
5317                                 || (substr($list->[-1]->name, 0, 1) ne '_'
5318                                     && $list->[-1]->name =~ /\D/))
5319                             ? @$list
5320                             : @$list - 1;
5321         splice @$list,
5322                 $insert_position,
5323                 0,
5324                 Alias->new($name, $loose_match, $make_re_pod_entry,
5325                                                 $ok_as_filename, $status, $ucd);
5326
5327         # This name may be shorter than any existing ones, so clear the cache
5328         # of the shortest, so will have to be recalculated.
5329         no overloading;
5330         undef $short_name{pack 'J', $self};
5331         return;
5332     }
5333
5334     sub short_name {
5335         # Returns a name suitable for use as the base part of a file name.
5336         # That is, shorter wins.  It can return undef if there is no suitable
5337         # name.  The name has all non-essential underscores removed.
5338
5339         # The optional second parameter is a reference to a scalar in which
5340         # this routine will store the length the returned name had before the
5341         # underscores were removed, or undef if the return is undef.
5342
5343         # The shortest name can change if new aliases are added.  So using
5344         # this should be deferred until after all these are added.  The code
5345         # that does that should clear this one's cache.
5346         # Any name with alphabetics is preferred over an all numeric one, even
5347         # if longer.
5348
5349         my $self = shift;
5350         my $nominal_length_ptr = shift;
5351         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5352
5353         my $addr = do { no overloading; pack 'J', $self; };
5354
5355         # For efficiency, don't recalculate, but this means that adding new
5356         # aliases could change what the shortest is, so the code that does
5357         # that needs to undef this.
5358         if (defined $short_name{$addr}) {
5359             if ($nominal_length_ptr) {
5360                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5361             }
5362             return $short_name{$addr};
5363         }
5364
5365         # Look at each alias
5366         foreach my $alias ($self->aliases()) {
5367
5368             # Don't use an alias that isn't ok to use for an external name.
5369             next if ! $alias->ok_as_filename;
5370
5371             my $name = main::Standardize($alias->name);
5372             trace $self, $name if main::DEBUG && $to_trace;
5373
5374             # Take the first one, or a shorter one that isn't numeric.  This
5375             # relies on numeric aliases always being last in the array
5376             # returned by aliases().  Any alpha one will have precedence.
5377             if (! defined $short_name{$addr}
5378                 || ($name =~ /\D/
5379                     && length($name) < length($short_name{$addr})))
5380             {
5381                 # Remove interior underscores.
5382                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5383
5384                 $nominal_short_name_length{$addr} = length $name;
5385             }
5386         }
5387
5388         # If the short name isn't a nice one, perhaps an equivalent table has
5389         # a better one.
5390         if (! defined $short_name{$addr}
5391             || $short_name{$addr} eq ""
5392             || $short_name{$addr} eq "_")
5393         {
5394             my $return;
5395             foreach my $follower ($self->children) {    # All equivalents
5396                 my $follower_name = $follower->short_name;
5397                 next unless defined $follower_name;
5398
5399                 # Anything (except undefined) is better than underscore or
5400                 # empty
5401                 if (! defined $return || $return eq "_") {
5402                     $return = $follower_name;
5403                     next;
5404                 }
5405
5406                 # If the new follower name isn't "_" and is shorter than the
5407                 # current best one, prefer the new one.
5408                 next if $follower_name eq "_";
5409                 next if length $follower_name > length $return;
5410                 $return = $follower_name;
5411             }
5412             $short_name{$addr} = $return if defined $return;
5413         }
5414
5415         # If no suitable external name return undef
5416         if (! defined $short_name{$addr}) {
5417             $$nominal_length_ptr = undef if $nominal_length_ptr;
5418             return;
5419         }
5420
5421         # Don't allow a null short name.
5422         if ($short_name{$addr} eq "") {
5423             $short_name{$addr} = '_';
5424             $nominal_short_name_length{$addr} = 1;
5425         }
5426
5427         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5428
5429         if ($nominal_length_ptr) {
5430             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5431         }
5432         return $short_name{$addr};
5433     }
5434
5435     sub external_name {
5436         # Returns the external name that this table should be known by.  This
5437         # is usually the short_name, but not if the short_name is undefined,
5438         # in which case the external_name is arbitrarily set to the
5439         # underscore.
5440
5441         my $self = shift;
5442         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5443
5444         my $short = $self->short_name;
5445         return $short if defined $short;
5446
5447         return '_';
5448     }
5449
5450     sub add_description { # Adds the parameter as a short description.
5451
5452         my $self = shift;
5453         my $description = shift;
5454         chomp $description;
5455         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5456
5457         no overloading;
5458         push @{$description{pack 'J', $self}}, $description;
5459
5460         return;
5461     }
5462
5463     sub add_note { # Adds the parameter as a short note.
5464
5465         my $self = shift;
5466         my $note = shift;
5467         chomp $note;
5468         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5469
5470         no overloading;
5471         push @{$note{pack 'J', $self}}, $note;
5472
5473         return;
5474     }
5475
5476     sub add_comment { # Adds the parameter as a comment.
5477
5478         return unless $debugging_build;
5479
5480         my $self = shift;
5481         my $comment = shift;
5482         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5483
5484         chomp $comment;
5485
5486         no overloading;
5487         push @{$comment{pack 'J', $self}}, $comment;
5488
5489         return;
5490     }
5491
5492     sub comment {
5493         # Return the current comment for this table.  If called in list
5494         # context, returns the array of comments.  In scalar, returns a string
5495         # of each element joined together with a period ending each.
5496
5497         my $self = shift;
5498         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5499
5500         my $addr = do { no overloading; pack 'J', $self; };
5501         my @list = @{$comment{$addr}};
5502         return @list if wantarray;
5503         my $return = "";
5504         foreach my $sentence (@list) {
5505             $return .= '.  ' if $return;
5506             $return .= $sentence;
5507             $return =~ s/\.$//;
5508         }
5509         $return .= '.' if $return;
5510         return $return;
5511     }
5512
5513     sub initialize {
5514         # Initialize the table with the argument which is any valid
5515         # initialization for range lists.
5516
5517         my $self = shift;
5518         my $addr = do { no overloading; pack 'J', $self; };
5519         my $initialization = shift;
5520         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5521
5522         # Replace the current range list with a new one of the same exact
5523         # type.
5524         my $class = ref $range_list{$addr};
5525         $range_list{$addr} = $class->new(Owner => $self,
5526                                         Initialize => $initialization);
5527         return;
5528
5529     }
5530
5531     sub header {
5532         # The header that is output for the table in the file it is written
5533         # in.
5534
5535         my $self = shift;
5536         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5537
5538         my $return = "";
5539         $return .= $DEVELOPMENT_ONLY if $compare_versions;
5540         $return .= $HEADER;
5541         return $return;
5542     }
5543
5544     sub merge_single_annotation_line ($$$) {
5545         my ($output, $annotation, $annotation_column) = @_;
5546
5547         # This appends an annotation comment, $annotation, to $output,
5548         # starting in or after column $annotation_column, removing any
5549         # pre-existing comment from $output.
5550
5551         $annotation =~ s/^ \s* \# \  //x;
5552         $output =~ s/ \s* ( \# \N* )? \n //x;
5553         $output = Text::Tabs::expand($output);
5554
5555         my $spaces = $annotation_column - length $output;
5556         $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
5557
5558         $output = sprintf "%s%*s# %s",
5559                             $output,
5560                             $spaces,
5561                             " ",
5562                             $annotation;
5563         return Text::Tabs::unexpand $output;
5564     }
5565
5566     sub write {
5567         # Write a representation of the table to its file.  It calls several
5568         # functions furnished by sub-classes of this abstract base class to
5569         # handle non-normal ranges, to add stuff before the table, and at its
5570         # end.  If the table is to be written so that adjustments are
5571         # required, this does that conversion.
5572
5573         my $self = shift;
5574         my $use_adjustments = shift; # ? output in adjusted format or not
5575         my $suppress_value = shift;  # Optional, if the value associated with
5576                                      # a range equals this one, don't write
5577                                      # the range
5578         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5579
5580         my $addr = do { no overloading; pack 'J', $self; };
5581         my $write_as_invlist = $write_as_invlist{$addr};
5582
5583         # Start with the header
5584         my @HEADER = $self->header;
5585
5586         # Then the comments
5587         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5588                                                         if $comment{$addr};
5589
5590         # Things discovered processing the main body of the document may
5591         # affect what gets output before it, therefore pre_body() isn't called
5592         # until after all other processing of the table is done.
5593
5594         # The main body looks like a 'here' document.  If there are comments,
5595         # get rid of them when processing it.
5596         my @OUT;
5597         if ($annotate || $output_range_counts) {
5598             # Use the line below in Perls that don't have /r
5599             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5600             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5601         } else {
5602             push @OUT, "return <<'END';\n";
5603         }
5604
5605         if ($range_list{$addr}->is_empty) {
5606
5607             # This is a kludge for empty tables to silence a warning in
5608             # utf8.c, which can't really deal with empty tables, but it can
5609             # deal with a table that matches nothing, as the inverse of 'All'
5610             # does.
5611             push @OUT, "!utf8::All\n";
5612         }
5613         elsif ($self->name eq 'N'
5614
5615                # To save disk space and table cache space, avoid putting out
5616                # binary N tables, but instead create a file which just inverts
5617                # the Y table.  Since the file will still exist and occupy a
5618                # certain number of blocks, might as well output the whole
5619                # thing if it all will fit in one block.   The number of
5620                # ranges below is an approximate number for that.
5621                && ($self->property->type == $BINARY
5622                    || $self->property->type == $FORCED_BINARY)
5623                # && $self->property->tables == 2  Can't do this because the
5624                #        non-binary properties, like NFDQC aren't specifiable
5625                #        by the notation
5626                && $range_list{$addr}->ranges > 15
5627                && ! $annotate)  # Under --annotate, want to see everything
5628         {
5629             push @OUT, "!utf8::" . $self->property->name . "\n";
5630         }
5631         else {
5632             my $range_size_1 = $range_size_1{$addr};
5633
5634             # To make it more readable, use a minimum indentation
5635             my $comment_indent;
5636
5637             # These are used only in $annotate option
5638             my $format;         # e.g. $HEX_ADJUST_FORMAT
5639             my $include_name;   # ? Include the character's name in the
5640                                 # annotation?
5641             my $include_cp;     # ? Include its code point
5642
5643             if (! $annotate) {
5644                 $comment_indent = ($self->isa('Map_Table'))
5645                                   ? 24
5646                                   : ($write_as_invlist)
5647                                     ? 8
5648                                     : 16;
5649             }
5650             else {
5651                 $format = $self->format;
5652
5653                 # The name of the character is output only for tables that
5654                 # don't already include the name in the output.
5655                 my $property = $self->property;
5656                 $include_name =
5657                     !  ($property == $perl_charname
5658                         || $property == main::property_ref('Unicode_1_Name')
5659                         || $property == main::property_ref('Name')
5660                         || $property == main::property_ref('Name_Alias')
5661                        );
5662
5663                 # Don't include the code point in the annotation where all
5664                 # lines are a single code point, so it can be easily found in
5665                 # the first column
5666                 $include_cp = ! $range_size_1;
5667
5668                 if (! $self->isa('Map_Table')) {
5669                     $comment_indent = ($write_as_invlist) ? 8 : 16;
5670                 }
5671                 else {
5672                     $comment_indent = 16;
5673
5674                     # There are just a few short ranges in this table, so no
5675                     # need to include the code point in the annotation.
5676                     $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5677
5678                     # We're trying to get this to look good, as the whole
5679                     # point is to make human-readable tables.  It is easier to
5680                     # read if almost all the annotation comments begin in the
5681                     # same column.  Map tables have varying width maps, so can
5682                     # create a jagged comment appearance.  This code does a
5683                     # preliminary pass through these tables looking for the
5684                     # maximum width map in each, and causing the comments to
5685                     # begin just to the right of that.  However, if the
5686                     # comments begin too far to the right of most lines, it's
5687                     # hard to line them up horizontally with their real data.
5688                     # Therefore we ignore the longest outliers
5689                     my $ignore_longest_X_percent = 2;  # Discard longest X%
5690
5691                     # Each key in this hash is a width of at least one of the
5692                     # maps in the table.  Its value is how many lines have
5693                     # that width.
5694                     my %widths;
5695
5696                     # We won't space things further left than one tab stop
5697                     # after the rest of the line; initializing it to that
5698                     # number saves some work.
5699                     my $max_map_width = 8;
5700
5701                     # Fill in the %widths hash
5702                     my $total = 0;
5703                     for my $set ($range_list{$addr}->ranges) {
5704                         my $value = $set->value;
5705
5706                         # These range types don't appear in the main table
5707                         next if $set->type == 0
5708                                 && defined $suppress_value
5709                                 && $value eq $suppress_value;
5710                         next if $set->type == $MULTI_CP
5711                                 || $set->type == $NULL;
5712
5713                         # Include 2 spaces before the beginning of the
5714                         # comment
5715                         my $this_width = length($value) + 2;
5716
5717                         # Ranges of the remaining non-zero types usually
5718                         # occupy just one line (maybe occasionally two, but
5719                         # this doesn't have to be dead accurate).  This is
5720                         # because these ranges are like "unassigned code
5721                         # points"
5722                         my $count = ($set->type != 0)
5723                                     ? 1
5724                                     : $set->end - $set->start + 1;
5725                         $widths{$this_width} += $count;
5726                         $total += $count;
5727                         $max_map_width = $this_width
5728                                             if $max_map_width < $this_width;
5729                     }
5730
5731                     # If the widest map gives us less than two tab stops
5732                     # worth, just take it as-is.
5733                     if ($max_map_width > 16) {
5734
5735                         # Otherwise go through %widths until we have included
5736                         # the desired percentage of lines in the whole table.
5737                         my $running_total = 0;
5738                         foreach my $width (sort { $a <=> $b } keys %widths)
5739                         {
5740                             $running_total += $widths{$width};
5741                             use integer;
5742                             if ($running_total * 100 / $total
5743                                             >= 100 - $ignore_longest_X_percent)
5744                             {
5745                                 $max_map_width = $width;
5746                                 last;
5747                             }
5748                         }
5749                     }
5750                     $comment_indent += $max_map_width;
5751                 }
5752             }
5753
5754             # Values for previous time through the loop.  Initialize to
5755             # something that won't be adjacent to the first iteration;
5756             # only $previous_end matters for that.
5757             my $previous_start;
5758             my $previous_end = -2;
5759             my $previous_value;
5760
5761             # Values for next time through the portion of the loop that splits
5762             # the range.  0 in $next_start means there is no remaining portion
5763             # to deal with.
5764             my $next_start = 0;
5765             my $next_end;
5766             my $next_value;
5767             my $offset = 0;
5768             my $invlist_count = 0;
5769
5770             my $output_value_in_hex = $self->isa('Map_Table')
5771                                 && ($self->format eq $HEX_ADJUST_FORMAT
5772                                     || $self->to_output_map == $EXTERNAL_MAP);
5773             # Use leading zeroes just for files whose format should not be
5774             # changed from what it has been.  Otherwise, they just take up
5775             # space and time to process.
5776             my $hex_format = ($self->isa('Map_Table')
5777                               && $self->to_output_map == $EXTERNAL_MAP)
5778                              ? "%04X"
5779                              : "%X";
5780
5781             # The values for some of these tables are stored in mktables as
5782             # hex strings.  Normally, these are just output as strings without
5783             # change, but when we are doing adjustments, we have to operate on
5784             # these numerically, so we convert those to decimal to do that,
5785             # and back to hex for output
5786             my $convert_map_to_from_hex = 0;
5787             my $output_map_in_hex = 0;
5788             if ($self->isa('Map_Table')) {
5789                 $convert_map_to_from_hex
5790                    = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
5791                       || ($annotate && $self->format eq $HEX_FORMAT);
5792                 $output_map_in_hex = $convert_map_to_from_hex
5793                                  || $self->format eq $HEX_FORMAT;
5794             }
5795
5796             # To store any annotations about the characters.
5797             my @annotation;
5798
5799             # Output each range as part of the here document.
5800             RANGE:
5801             for my $set ($range_list{$addr}->ranges) {
5802                 if ($set->type != 0) {
5803                     $self->handle_special_range($set);
5804                     next RANGE;
5805                 }
5806                 my $start = $set->start;
5807                 my $end   = $set->end;
5808                 my $value  = $set->value;
5809
5810                 # Don't output ranges whose value is the one to suppress
5811                 next RANGE if defined $suppress_value
5812                               && $value eq $suppress_value;
5813
5814                 $value = CORE::hex $value if $convert_map_to_from_hex;
5815
5816
5817                 {   # This bare block encloses the scope where we may need to
5818                     # 'redo' to.  Consider a table that is to be written out
5819                     # using single item ranges.  This is given in the
5820                     # $range_size_1 boolean.  To accomplish this, we split the
5821                     # range each time through the loop into two portions, the
5822                     # first item, and the rest.  We handle that first item
5823                     # this time in the loop, and 'redo' to repeat the process
5824                     # for the rest of the range.
5825                     #
5826                     # We may also have to do it, with other special handling,
5827                     # if the table has adjustments.  Consider the table that
5828                     # contains the lowercasing maps.  mktables stores the
5829                     # ASCII range ones as 26 ranges:
5830                     #       ord('A') => ord('a'), .. ord('Z') => ord('z')
5831                     # For compactness, the table that gets written has this as
5832                     # just one range
5833                     #       ( ord('A') .. ord('Z') ) => ord('a')
5834                     # and the software that reads the tables is smart enough
5835                     # to "connect the dots".  This change is accomplished in
5836                     # this loop by looking to see if the current iteration
5837                     # fits the paradigm of the previous iteration, and if so,
5838                     # we merge them by replacing the final output item with
5839                     # the merged data.  Repeated 25 times, this gets A-Z.  But
5840                     # we also have to make sure we don't screw up cases where
5841                     # we have internally stored
5842                     #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
5843                     # This single internal range has to be output as 3 ranges,
5844                     # which is done by splitting, like we do for $range_size_1
5845                     # tables.  (There are very few of such ranges that need to
5846                     # be split, so the gain of doing the combining of other
5847                     # ranges far outweighs the splitting of these.)  The
5848                     # values to use for the redo at the end of this block are
5849                     # set up just below in the scalars whose names begin with
5850                     # '$next_'.
5851
5852                     if (($use_adjustments || $range_size_1) && $end != $start)
5853                     {
5854                         $next_start = $start + 1;
5855                         $next_end = $end;
5856                         $next_value = $value;
5857                         $end = $start;
5858                     }
5859
5860                     if ($use_adjustments && ! $range_size_1) {
5861
5862                         # If this range is adjacent to the previous one, and
5863                         # the values in each are integers that are also
5864                         # adjacent (differ by 1), then this range really
5865                         # extends the previous one that is already in element
5866                         # $OUT[-1].  So we pop that element, and pretend that
5867                         # the range starts with whatever it started with.
5868                         # $offset is incremented by 1 each time so that it
5869                         # gives the current offset from the first element in
5870                         # the accumulating range, and we keep in $value the
5871                         # value of that first element.
5872                         if ($start == $previous_end + 1
5873                             && $value =~ /^ -? \d+ $/xa
5874                             && $previous_value =~ /^ -? \d+ $/xa
5875                             && ($value == ($previous_value + ++$offset)))
5876                         {
5877                             pop @OUT;
5878                             $start = $previous_start;
5879                             $value = $previous_value;
5880                         }
5881                         else {
5882                             $offset = 0;
5883                             if (@annotation == 1) {
5884                                 $OUT[-1] = merge_single_annotation_line(
5885                                     $OUT[-1], $annotation[0], $comment_indent);
5886                             }
5887                             else {
5888                                 push @OUT, @annotation;
5889                             }
5890                         }
5891                         undef @annotation;
5892
5893                         # Save the current values for the next time through
5894                         # the loop.
5895                         $previous_start = $start;
5896                         $previous_end = $end;
5897                         $previous_value = $value;
5898                     }
5899
5900                     if ($write_as_invlist) {
5901
5902                         # Inversion list format has a single number per line,
5903                         # the starting code point of a range that matches the
5904                         # property
5905                         push @OUT, $start, "\n";
5906                         $invlist_count++;
5907
5908                         # Add a comment with the size of the range, if
5909                         # requested.
5910                         if ($output_range_counts{$addr}) {
5911                             $OUT[-1] = merge_single_annotation_line(
5912                                     $OUT[-1],
5913                                     "# ["
5914                                       . main::clarify_code_point_count($end - $start + 1)
5915                                       . "]\n",
5916                                     $comment_indent);
5917                         }
5918                     }
5919                     elsif ($start != $end) { # If there is a range
5920                         if ($end == $MAX_WORKING_CODEPOINT) {
5921                             push @OUT, sprintf "$hex_format\t$hex_format",
5922                                                 $start,
5923                                                 $MAX_PLATFORM_CODEPOINT;
5924                         }
5925                         else {
5926                             push @OUT, sprintf "$hex_format\t$hex_format",
5927                                                 $start,       $end;
5928                         }
5929                         if (length $value) {
5930                             if ($convert_map_to_from_hex) {
5931                                 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
5932                             }
5933                             else {
5934                                 $OUT[-1] .= "\t$value\n";
5935                             }
5936                         }
5937
5938                         # Add a comment with the size of the range, if
5939                         # requested.
5940                         if ($output_range_counts{$addr}) {
5941                             $OUT[-1] = merge_single_annotation_line(
5942                                     $OUT[-1],
5943                                     "# ["
5944                                       . main::clarify_code_point_count($end - $start + 1)
5945                                       . "]\n",
5946                                     $comment_indent);
5947                         }
5948                     }
5949                     else { # Here to output a single code point per line.
5950
5951                         # Use any passed in subroutine to output.
5952                         if (ref $range_size_1 eq 'CODE') {
5953                             for my $i ($start .. $end) {
5954                                 push @OUT, &{$range_size_1}($i, $value);
5955                             }
5956                         }
5957                         else {
5958
5959                             # Here, caller is ok with default output.
5960                             for (my $i = $start; $i <= $end; $i++) {
5961                                 if ($convert_map_to_from_hex) {
5962                                     push @OUT,
5963                                         sprintf "$hex_format\t\t$hex_format\n",
5964                                                  $i,            $value;
5965                                 }
5966                                 else {
5967                                     push @OUT, sprintf $hex_format, $i;
5968                                     $OUT[-1] .= "\t\t$value" if $value ne "";
5969                                     $OUT[-1] .= "\n";
5970                                 }
5971                             }
5972                         }
5973                     }
5974
5975                     if ($annotate) {
5976                         for (my $i = $start; $i <= $end; $i++) {
5977                             my $annotation = "";
5978
5979                             # Get character information if don't have it already
5980                             main::populate_char_info($i)
5981                                                      if ! defined $viacode[$i];
5982                             my $type = $annotate_char_type[$i];
5983
5984                             # Figure out if should output the next code points
5985                             # as part of a range or not.  If this is not in an
5986                             # annotation range, then won't output as a range,
5987                             # so returns $i.  Otherwise use the end of the
5988                             # annotation range, but no further than the
5989                             # maximum possible end point of the loop.
5990                             my $range_end =
5991                                         $range_size_1
5992                                         ? $start
5993                                         : main::min(
5994                                           $annotate_ranges->value_of($i) || $i,
5995                                           $end);
5996
5997                             # Use a range if it is a range, and either is one
5998                             # of the special annotation ranges, or the range
5999                             # is at most 3 long.  This last case causes the
6000                             # algorithmically named code points to be output
6001                             # individually in spans of at most 3, as they are
6002                             # the ones whose $type is > 0.
6003                             if ($range_end != $i
6004                                 && ( $type < 0 || $range_end - $i > 2))
6005                             {
6006                                 # Here is to output a range.  We don't allow a
6007                                 # caller-specified output format--just use the
6008                                 # standard one.
6009                                 my $range_name = $viacode[$i];
6010
6011                                 # For the code points which end in their hex
6012                                 # value, we eliminate that from the output
6013                                 # annotation, and capitalize only the first
6014                                 # letter of each word.
6015                                 if ($type == $CP_IN_NAME) {
6016                                     my $hex = sprintf $hex_format, $i;
6017                                     $range_name =~ s/-$hex$//;
6018                                     my @words = split " ", $range_name;
6019                                     for my $word (@words) {
6020                                         $word =
6021                                           ucfirst(lc($word)) if $word ne 'CJK';
6022                                     }
6023                                     $range_name = join " ", @words;
6024                                 }
6025                                 elsif ($type == $HANGUL_SYLLABLE) {
6026                                     $range_name = "Hangul Syllable";
6027                                 }
6028
6029                                 if ($i != $start || $range_end < $end) {
6030                                     if ($range_end < $MAX_WORKING_CODEPOINT)
6031                                     {
6032                                         $annotation = sprintf "%04X..%04X",
6033                                                               $i,   $range_end;
6034                                     }
6035                                     else {
6036                                         $annotation = sprintf "%04X..INFINITY",
6037                                                                $i;
6038                                     }
6039                                 }
6040                                 else { # Indent if not displaying code points
6041                                     $annotation = " " x 4;
6042                                 }
6043                                 $annotation .= " $range_name" if $range_name;
6044
6045                                 # Include the number of code points in the
6046                                 # range
6047                                 my $count =
6048                                     main::clarify_code_point_count($range_end - $i + 1);
6049                                 $annotation .= " [$count]\n";
6050
6051                                 # Skip to the end of the range
6052                                 $i = $range_end;
6053                             }
6054                             else { # Not in a range.
6055                                 my $comment = "";
6056
6057                                 # When outputting the names of each character,
6058                                 # use the character itself if printable
6059                                 $comment .= "'" . main::display_chr($i) . "' "
6060                                                             if $printable[$i];
6061
6062                                 my $output_value = $value;
6063
6064                                 # Determine the annotation
6065                                 if ($format eq $DECOMP_STRING_FORMAT) {
6066
6067                                     # This is very specialized, with the type
6068                                     # of decomposition beginning the line
6069                                     # enclosed in <...>, and the code points
6070                                     # that the code point decomposes to
6071                                     # separated by blanks.  Create two
6072                                     # strings, one of the printable
6073                                     # characters, and one of their official
6074                                     # names.
6075                                     (my $map = $output_value)
6076                                                     =~ s/ \ * < .*? > \ +//x;
6077                                     my $tostr = "";
6078                                     my $to_name = "";
6079                                     my $to_chr = "";
6080                                     foreach my $to (split " ", $map) {
6081                                         $to = CORE::hex $to;
6082                                         $to_name .= " + " if $to_name;
6083                                         $to_chr .= main::display_chr($to);
6084                                         main::populate_char_info($to)
6085                                                     if ! defined $viacode[$to];
6086                                         $to_name .=  $viacode[$to];
6087                                     }
6088
6089                                     $comment .=
6090                                     "=> '$to_chr'; $viacode[$i] => $to_name";
6091                                 }
6092                                 else {
6093                                     $output_value += $i - $start
6094                                                    if $use_adjustments
6095                                                       # Don't try to adjust a
6096                                                       # non-integer
6097                                                    && $output_value !~ /[-\D]/;
6098
6099                                     if ($output_map_in_hex) {
6100                                         main::populate_char_info($output_value)
6101                                           if ! defined $viacode[$output_value];
6102                                         $comment .= " => '"
6103                                         . main::display_chr($output_value)
6104                                         . "'; " if $printable[$output_value];
6105                                     }
6106                                     if ($include_name && $viacode[$i]) {
6107                                         $comment .= " " if $comment;
6108                                         $comment .= $viacode[$i];
6109                                     }
6110                                     if ($output_map_in_hex) {
6111                                         $comment .=
6112                                                 " => $viacode[$output_value]"
6113                                                     if $viacode[$output_value];
6114                                         $output_value = sprintf($hex_format,
6115                                                                 $output_value);
6116                                     }
6117                                 }
6118
6119                                 if ($include_cp) {
6120                                     $annotation = sprintf "%04X", $i;
6121                                     if ($use_adjustments) {
6122                                         $annotation .= " => $output_value";
6123                                     }
6124                                 }
6125
6126                                 if ($comment ne "") {
6127                                     $annotation .= " " if $annotation ne "";
6128                                     $annotation .= $comment;
6129                                 }
6130                                 $annotation .= "\n" if $annotation ne "";
6131                             }
6132
6133                             if ($annotation ne "") {
6134                                 push @annotation, (" " x $comment_indent)
6135                                                   .  "# $annotation";
6136                             }
6137                         }
6138
6139                         # If not adjusting, we don't have to go through the
6140                         # loop again to know that the annotation comes next
6141                         # in the output.
6142                         if (! $use_adjustments) {
6143                             if (@annotation == 1) {
6144                                 $OUT[-1] = merge_single_annotation_line(
6145                                     $OUT[-1], $annotation[0], $comment_indent);
6146                             }
6147                             else {
6148                                 push @OUT, map { Text::Tabs::unexpand $_ }
6149                                                @annotation;
6150                             }
6151                             undef @annotation;
6152                         }
6153                     }
6154
6155                     # Add the beginning of the range that doesn't match the
6156                     # property, except if the just added match range extends
6157                     # to infinity.  We do this after any annotations for the
6158                     # match range.
6159                     if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6160                         push @OUT, $end + 1, "\n";
6161                         $invlist_count++;
6162                     }
6163
6164                     # If we split the range, set up so the next time through
6165                     # we get the remainder, and redo.
6166                     if ($next_start) {
6167                         $start = $next_start;
6168                         $end = $next_end;
6169                         $value = $next_value;
6170                         $next_start = 0;
6171                         redo;
6172                     }
6173                 }
6174             } # End of loop through all the table's ranges
6175
6176             push @OUT, @annotation; # Add orphaned annotation, if any
6177
6178             splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6179         }
6180
6181         # Add anything that goes after the main body, but within the here
6182         # document,
6183         my $append_to_body = $self->append_to_body;
6184         push @OUT, $append_to_body if $append_to_body;
6185
6186         # And finish the here document.
6187         push @OUT, "END\n";
6188
6189         # Done with the main portion of the body.  Can now figure out what
6190         # should appear before it in the file.
6191         my $pre_body = $self->pre_body;
6192         push @HEADER, $pre_body, "\n" if $pre_body;
6193
6194         # All these files should have a .pl suffix added to them.
6195         my @file_with_pl = @{$file_path{$addr}};
6196         $file_with_pl[-1] .= '.pl';
6197
6198         main::write(\@file_with_pl,
6199                     $annotate,      # utf8 iff annotating
6200                     \@HEADER,
6201                     \@OUT);
6202         return;
6203     }
6204
6205     sub set_status {    # Set the table's status
6206         my $self = shift;
6207         my $status = shift; # The status enum value
6208         my $info = shift;   # Any message associated with it.
6209         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6210
6211         my $addr = do { no overloading; pack 'J', $self; };
6212
6213         $status{$addr} = $status;
6214         $status_info{$addr} = $info;
6215         return;
6216     }
6217
6218     sub set_fate {  # Set the fate of a table
6219         my $self = shift;
6220         my $fate = shift;
6221         my $reason = shift;
6222         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6223
6224         my $addr = do { no overloading; pack 'J', $self; };
6225
6226         return if $fate{$addr} == $fate;    # If no-op
6227
6228         # Can only change the ordinary fate, except if going to $MAP_PROXIED
6229         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6230
6231         $fate{$addr} = $fate;
6232
6233         # Don't document anything to do with a non-normal fated table
6234         if ($fate != $ORDINARY) {
6235             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6236             foreach my $alias ($self->aliases) {
6237                 $alias->set_ucd($put_in_pod);
6238
6239                 # MAP_PROXIED doesn't affect the match tables
6240                 next if $fate == $MAP_PROXIED;
6241                 $alias->set_make_re_pod_entry($put_in_pod);
6242             }
6243         }
6244
6245         # Save the reason for suppression for output
6246         if ($fate == $SUPPRESSED && defined $reason) {
6247             $why_suppressed{$complete_name{$addr}} = $reason;
6248         }
6249
6250         return;
6251     }
6252
6253     sub lock {
6254         # Don't allow changes to the table from now on.  This stores a stack
6255         # trace of where it was called, so that later attempts to modify it
6256         # can immediately show where it got locked.
6257
6258         my $self = shift;
6259         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6260
6261         my $addr = do { no overloading; pack 'J', $self; };
6262
6263         $locked{$addr} = "";
6264
6265         my $line = (caller(0))[2];
6266         my $i = 1;
6267
6268         # Accumulate the stack trace
6269         while (1) {
6270             my ($pkg, $file, $caller_line, $caller) = caller $i++;
6271
6272             last unless defined $caller;
6273
6274             $locked{$addr} .= "    called from $caller() at line $line\n";
6275             $line = $caller_line;
6276         }
6277         $locked{$addr} .= "    called from main at line $line\n";
6278
6279         return;
6280     }
6281
6282     sub carp_if_locked {
6283         # Return whether a table is locked or not, and, by the way, complain
6284         # if is locked
6285
6286         my $self = shift;
6287         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6288
6289         my $addr = do { no overloading; pack 'J', $self; };
6290
6291         return 0 if ! $locked{$addr};
6292         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6293         return 1;
6294     }
6295
6296     sub set_file_path { # Set the final directory path for this table
6297         my $self = shift;
6298         # Rest of parameters passed on
6299
6300         no overloading;
6301         @{$file_path{pack 'J', $self}} = @_;
6302         return
6303     }
6304
6305     # Accessors for the range list stored in this table.  First for
6306     # unconditional
6307     for my $sub (qw(
6308                     containing_range
6309                     contains
6310                     count
6311                     each_range
6312                     hash
6313                     is_empty
6314                     matches_identically_to
6315                     max
6316                     min
6317                     range_count
6318                     reset_each_range
6319                     type_of
6320                     value_of
6321                 ))
6322     {
6323         no strict "refs";
6324         *$sub = sub {
6325             use strict "refs";
6326             my $self = shift;
6327             return $self->_range_list->$sub(@_);
6328         }
6329     }
6330
6331     # Then for ones that should fail if locked
6332     for my $sub (qw(
6333                     delete_range
6334                 ))
6335     {
6336         no strict "refs";
6337         *$sub = sub {
6338             use strict "refs";
6339             my $self = shift;
6340
6341             return if $self->carp_if_locked;
6342             no overloading;
6343             return $self->_range_list->$sub(@_);
6344         }
6345     }
6346
6347 } # End closure
6348
6349 package Map_Table;
6350 use parent '-norequire', '_Base_Table';
6351
6352 # A Map Table is a table that contains the mappings from code points to
6353 # values.  There are two weird cases:
6354 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6355 #    are written in the table's file at the end of the table nonetheless.  It
6356 #    requires specially constructed code to handle these; utf8.c can not read
6357 #    these in, so they should not go in $map_directory.  As of this writing,
6358 #    the only case that these happen is for named sequences used in
6359 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
6360 #    something else could come along that uses it.
6361 # 2) Specials are anything that doesn't fit syntactically into the body of the
6362 #    table.  The ranges for these have a map type of non-zero.  The code below
6363 #    knows about and handles each possible type.   In most cases, these are
6364 #    written as part of the header.
6365 #
6366 # A map table deliberately can't be manipulated at will unlike match tables.
6367 # This is because of the ambiguities having to do with what to do with
6368 # overlapping code points.  And there just isn't a need for those things;
6369 # what one wants to do is just query, add, replace, or delete mappings, plus
6370 # write the final result.
6371 # However, there is a method to get the list of possible ranges that aren't in
6372 # this table to use for defaulting missing code point mappings.  And,
6373 # map_add_or_replace_non_nulls() does allow one to add another table to this
6374 # one, but it is clearly very specialized, and defined that the other's
6375 # non-null values replace this one's if there is any overlap.
6376
6377 sub trace { return main::trace(@_); }
6378
6379 { # Closure
6380
6381     main::setup_package();
6382
6383     my %default_map;
6384     # Many input files omit some entries; this gives what the mapping for the
6385     # missing entries should be
6386     main::set_access('default_map', \%default_map, 'r');
6387
6388     my %anomalous_entries;
6389     # Things that go in the body of the table which don't fit the normal
6390     # scheme of things, like having a range.  Not much can be done with these
6391     # once there except to output them.  This was created to handle named
6392     # sequences.
6393     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6394     main::set_access('anomalous_entries',       # Append singular, read plural
6395                     \%anomalous_entries,
6396                     'readable_array');
6397
6398     my %replacement_property;
6399     # Certain files are unused by Perl itself, and are kept only for backwards
6400     # compatibility for programs that used them before Unicode::UCD existed.
6401     # These are termed legacy properties.  At some point they may be removed,
6402     # but for now mark them as legacy.  If non empty, this is the name of the
6403     # property to use instead (i.e., the modern equivalent).
6404     main::set_access('replacement_property', \%replacement_property, 'r');
6405
6406     my %to_output_map;
6407     # Enum as to whether or not to write out this map table, and how:
6408     #   0               don't output
6409     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6410     #                   it should not be removed nor its format changed.  This
6411     #                   is done for those files that have traditionally been
6412     #                   output.  Maps of legacy-only properties default to
6413     #                   this.
6414     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6415     #                   with this file
6416     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6417     #                   outputting the actual mappings as-is, we adjust things
6418     #                   to create a much more compact table. Only those few
6419     #                   tables where the mapping is convertible at least to an
6420     #                   integer and compacting makes a big difference should
6421     #                   have this.  Hence, the default is to not do this
6422     #                   unless the table's default mapping is to $CODE_POINT,
6423     #                   and the range size is not 1.
6424     main::set_access('to_output_map', \%to_output_map, 's');
6425
6426     sub new {
6427         my $class = shift;
6428         my $name = shift;
6429
6430         my %args = @_;
6431
6432         # Optional initialization data for the table.
6433         my $initialize = delete $args{'Initialize'};
6434
6435         my $default_map = delete $args{'Default_Map'};
6436         my $property = delete $args{'_Property'};
6437         my $full_name = delete $args{'Full_Name'};
6438         my $replacement_property = delete $args{'Replacement_Property'} // "";
6439         my $to_output_map = delete $args{'To_Output_Map'};
6440
6441         # Rest of parameters passed on; legacy properties have several common
6442         # other attributes
6443         if ($replacement_property) {
6444             $args{"Fate"} = $LEGACY_ONLY;
6445             $args{"Range_Size_1"} = 1;
6446             $args{"Perl_Extension"} = 1;
6447             $args{"UCD"} = 0;
6448         }
6449
6450         my $range_list = Range_Map->new(Owner => $property);
6451
6452         my $self = $class->SUPER::new(
6453                                     Name => $name,
6454                                     Complete_Name =>  $full_name,
6455                                     Full_Name => $full_name,
6456                                     _Property => $property,
6457                                     _Range_List => $range_list,
6458                                     Write_As_Invlist => 0,
6459                                     %args);
6460
6461         my $addr = do { no overloading; pack 'J', $self; };
6462
6463         $anomalous_entries{$addr} = [];
6464         $default_map{$addr} = $default_map;
6465         $replacement_property{$addr} = $replacement_property;
6466         $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6467                                           && $replacement_property;
6468         $to_output_map{$addr} = $to_output_map;
6469
6470         $self->initialize($initialize) if defined $initialize;
6471
6472         return $self;
6473     }
6474
6475     use overload
6476         fallback => 0,
6477         qw("") => "_operator_stringify",
6478     ;
6479
6480     sub _operator_stringify {
6481         my $self = shift;
6482
6483         my $name = $self->property->full_name;
6484         $name = '""' if $name eq "";
6485         return "Map table for Property '$name'";
6486     }
6487
6488     sub add_alias {
6489         # Add a synonym for this table (which means the property itself)
6490         my $self = shift;
6491         my $name = shift;
6492         # Rest of parameters passed on.
6493
6494         $self->SUPER::add_alias($name, $self->property, @_);
6495         return;
6496     }
6497
6498     sub add_map {
6499         # Add a range of code points to the list of specially-handled code
6500         # points.  $MULTI_CP is assumed if the type of special is not passed
6501         # in.
6502
6503         my $self = shift;
6504         my $lower = shift;
6505         my $upper = shift;
6506         my $string = shift;
6507         my %args = @_;
6508
6509         my $type = delete $args{'Type'} || 0;
6510         # Rest of parameters passed on
6511
6512         # Can't change the table if locked.
6513         return if $self->carp_if_locked;
6514
6515         my $addr = do { no overloading; pack 'J', $self; };
6516
6517         $self->_range_list->add_map($lower, $upper,
6518                                     $string,
6519                                     @_,
6520                                     Type => $type);
6521         return;
6522     }
6523
6524     sub append_to_body {
6525         # Adds to the written HERE document of the table's body any anomalous
6526         # entries in the table..
6527
6528         my $self = shift;
6529         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6530
6531         my $addr = do { no overloading; pack 'J', $self; };
6532
6533         return "" unless @{$anomalous_entries{$addr}};
6534         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6535     }
6536
6537     sub map_add_or_replace_non_nulls {
6538         # This adds the mappings in the table $other to $self.  Non-null
6539         # mappings from $other override those in $self.  It essentially merges
6540         # the two tables, with the second having priority except for null
6541         # mappings.
6542
6543         my $self = shift;
6544         my $other = shift;
6545         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6546
6547         return if $self->carp_if_locked;
6548
6549         if (! $other->isa(__PACKAGE__)) {
6550             Carp::my_carp_bug("$other should be a "
6551                         . __PACKAGE__
6552                         . ".  Not a '"
6553                         . ref($other)
6554                         . "'.  Not added;");
6555             return;
6556         }
6557
6558         my $addr = do { no overloading; pack 'J', $self; };
6559         my $other_addr = do { no overloading; pack 'J', $other; };
6560
6561         local $to_trace = 0 if main::DEBUG;
6562
6563         my $self_range_list = $self->_range_list;
6564         my $other_range_list = $other->_range_list;
6565         foreach my $range ($other_range_list->ranges) {
6566             my $value = $range->value;
6567             next if $value eq "";
6568             $self_range_list->_add_delete('+',
6569                                           $range->start,
6570                                           $range->end,
6571                                           $value,
6572                                           Type => $range->type,
6573                                           Replace => $UNCONDITIONALLY);
6574         }
6575
6576         return;
6577     }
6578
6579     sub set_default_map {
6580         # Define what code points that are missing from the input files should
6581         # map to
6582
6583         my $self = shift;
6584         my $map = shift;
6585         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6586
6587         my $addr = do { no overloading; pack 'J', $self; };
6588
6589         # Convert the input to the standard equivalent, if any (won't have any
6590         # for $STRING properties)
6591         my $standard = $self->_find_table_from_alias->{$map};
6592         $map = $standard->name if defined $standard;
6593
6594         # Warn if there already is a non-equivalent default map for this
6595         # property.  Note that a default map can be a ref, which means that
6596         # what it actually means is delayed until later in the program, and it
6597         # IS permissible to override it here without a message.
6598         my $default_map = $default_map{$addr};
6599         if (defined $default_map
6600             && ! ref($default_map)
6601             && $default_map ne $map
6602             && main::Standardize($map) ne $default_map)
6603         {
6604             my $property = $self->property;
6605             my $map_table = $property->table($map);
6606             my $default_table = $property->table($default_map);
6607             if (defined $map_table
6608                 && defined $default_table
6609                 && $map_table != $default_table)
6610             {
6611                 Carp::my_carp("Changing the default mapping for "
6612                             . $property
6613                             . " from $default_map to $map'");
6614             }
6615         }
6616
6617         $default_map{$addr} = $map;
6618
6619         # Don't also create any missing table for this map at this point,
6620         # because if we did, it could get done before the main table add is
6621         # done for PropValueAliases.txt; instead the caller will have to make
6622         # sure it exists, if desired.
6623         return;
6624     }
6625
6626     sub to_output_map {
6627         # Returns boolean: should we write this map table?
6628
6629         my $self = shift;
6630         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6631
6632         my $addr = do { no overloading; pack 'J', $self; };
6633
6634         # If overridden, use that
6635         return $to_output_map{$addr} if defined $to_output_map{$addr};
6636
6637         my $full_name = $self->full_name;
6638         return $global_to_output_map{$full_name}
6639                                 if defined $global_to_output_map{$full_name};
6640
6641         # If table says to output, do so; if says to suppress it, do so.
6642         my $fate = $self->fate;
6643         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6644         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6645         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6646
6647         my $type = $self->property->type;
6648
6649         # Don't want to output binary map tables even for debugging.
6650         return 0 if $type == $BINARY;
6651
6652         # But do want to output string ones.  All the ones that remain to
6653         # be dealt with (i.e. which haven't explicitly been set to external)
6654         # are for internal Perl use only.  The default for those that map to
6655         # $CODE_POINT and haven't been restricted to a single element range
6656         # is to use the adjusted form.
6657         if ($type == $STRING) {
6658             return $INTERNAL_MAP if $self->range_size_1
6659                                     || $default_map{$addr} ne $CODE_POINT;
6660             return $OUTPUT_ADJUSTED;
6661         }
6662
6663         # Otherwise is an $ENUM, do output it, for Perl's purposes
6664         return $INTERNAL_MAP;
6665     }
6666
6667     sub inverse_list {
6668         # Returns a Range_List that is gaps of the current table.  That is,
6669         # the inversion
6670
6671         my $self = shift;
6672         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6673
6674         my $current = Range_List->new(Initialize => $self->_range_list,
6675                                 Owner => $self->property);
6676         return ~ $current;
6677     }
6678
6679     sub header {
6680         my $self = shift;
6681         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6682
6683         my $return = $self->SUPER::header();
6684
6685         if ($self->to_output_map >= $INTERNAL_MAP) {
6686             $return .= $INTERNAL_ONLY_HEADER;
6687         }
6688         else {
6689             my $property_name = $self->property->replacement_property;
6690
6691             # The legacy-only properties were gotten above; but there are some
6692             # other properties whose files are in current use that have fixed
6693             # formats.
6694             $property_name = $self->property->full_name unless $property_name;
6695
6696             $return .= <<END;
6697
6698 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6699
6700 # This file is for internal use by core Perl only.  It is retained for
6701 # backwards compatibility with applications that may have come to rely on it,
6702 # but its format and even its name or existence are subject to change without
6703 # notice in a future Perl version.  Don't use it directly.  Instead, its
6704 # contents are now retrievable through a stable API in the Unicode::UCD
6705 # module: Unicode::UCD::prop_invmap('$property_name').
6706 END
6707         }
6708         return $return;
6709     }
6710
6711     sub set_final_comment {
6712         # Just before output, create the comment that heads the file
6713         # containing this table.
6714
6715         return unless $debugging_build;
6716
6717         my $self = shift;
6718         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6719
6720         # No sense generating a comment if aren't going to write it out.
6721         return if ! $self->to_output_map;
6722
6723         my $addr = do { no overloading; pack 'J', $self; };
6724
6725         my $property = $self->property;
6726
6727         # Get all the possible names for this property.  Don't use any that
6728         # aren't ok for use in a file name, etc.  This is perhaps causing that
6729         # flag to do double duty, and may have to be changed in the future to
6730         # have our own flag for just this purpose; but it works now to exclude
6731         # Perl generated synonyms from the lists for properties, where the
6732         # name is always the proper Unicode one.
6733         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6734
6735         my $count = $self->count;
6736         my $default_map = $default_map{$addr};
6737
6738         # The ranges that map to the default aren't output, so subtract that
6739         # to get those actually output.  A property with matching tables
6740         # already has the information calculated.
6741         if ($property->type != $STRING) {
6742             $count -= $property->table($default_map)->count;
6743         }
6744         elsif (defined $default_map) {
6745
6746             # But for $STRING properties, must calculate now.  Subtract the
6747             # count from each range that maps to the default.
6748             foreach my $range ($self->_range_list->ranges) {
6749                 if ($range->value eq $default_map) {
6750                     $count -= $range->end +1 - $range->start;
6751                 }
6752             }
6753
6754         }
6755
6756         # Get a  string version of $count with underscores in large numbers,
6757         # for clarity.
6758         my $string_count = main::clarify_code_point_count($count);
6759
6760         my $code_points = ($count == 1)
6761                         ? 'single code point'
6762                         : "$string_count code points";
6763
6764         my $mapping;
6765         my $these_mappings;
6766         my $are;
6767         if (@property_aliases <= 1) {
6768             $mapping = 'mapping';
6769             $these_mappings = 'this mapping';
6770             $are = 'is'
6771         }
6772         else {
6773             $mapping = 'synonymous mappings';
6774             $these_mappings = 'these mappings';
6775             $are = 'are'
6776         }
6777         my $cp;
6778         if ($count >= $MAX_UNICODE_CODEPOINTS) {
6779             $cp = "any code point in Unicode Version $string_version";
6780         }
6781         else {
6782             my $map_to;
6783             if ($default_map eq "") {
6784                 $map_to = 'the null string';
6785             }
6786             elsif ($default_map eq $CODE_POINT) {
6787                 $map_to = "itself";
6788             }
6789             else {
6790                 $map_to = "'$default_map'";
6791             }
6792             if ($count == 1) {
6793                 $cp = "the single code point";
6794             }
6795             else {
6796                 $cp = "one of the $code_points";
6797             }
6798             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6799         }
6800
6801         my $comment = "";
6802
6803         my $status = $self->status;
6804         if ($status ne $NORMAL) {
6805             my $warn = uc $status_past_participles{$status};
6806             $comment .= <<END;
6807
6808 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
6809  All property or property=value combinations contained in this file are $warn.
6810  See $unicode_reference_url for what this means.
6811
6812 END
6813         }
6814         $comment .= "This file returns the $mapping:\n";
6815
6816         my $ucd_accessible_name = "";
6817         my $full_name = $self->property->full_name;
6818         for my $i (0 .. @property_aliases - 1) {
6819             my $name = $property_aliases[$i]->name;
6820             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6821             if ($property_aliases[$i]->ucd) {
6822                 if ($name eq $full_name) {
6823                     $ucd_accessible_name = $full_name;
6824                 }
6825                 elsif (! $ucd_accessible_name) {
6826                     $ucd_accessible_name = $name;
6827                 }
6828             }
6829         }
6830         $comment .= "\nwhere 'cp' is $cp.";
6831         if ($ucd_accessible_name) {
6832             $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6833         }
6834
6835         # And append any commentary already set from the actual property.
6836         $comment .= "\n\n" . $self->comment if $self->comment;
6837         if ($self->description) {
6838             $comment .= "\n\n" . join " ", $self->description;
6839         }
6840         if ($self->note) {
6841             $comment .= "\n\n" . join " ", $self->note;
6842         }
6843         $comment .= "\n";
6844
6845         if (! $self->perl_extension) {
6846             $comment .= <<END;
6847
6848 For information about what this property really means, see:
6849 $unicode_reference_url
6850 END
6851         }
6852
6853         if ($count) {        # Format differs for empty table
6854                 $comment.= "\nThe format of the ";
6855             if ($self->range_size_1) {
6856                 $comment.= <<END;
6857 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6858 is in hex; MAPPING is what CODE_POINT maps to.
6859 END
6860             }
6861             else {
6862
6863                 # There are tables which end up only having one element per
6864                 # range, but it is not worth keeping track of for making just
6865                 # this comment a little better.
6866                 $comment.= <<END;
6867 non-comment portions of the main body of lines of this file is:
6868 START\\tSTOP\\tMAPPING where START is the starting code point of the
6869 range, in hex; STOP is the ending point, or if omitted, the range has just one
6870 code point; MAPPING is what each code point between START and STOP maps to.
6871 END
6872                 if ($self->output_range_counts) {
6873                     $comment .= <<END;
6874 Numbers in comments in [brackets] indicate how many code points are in the
6875 range (omitted when the range is a single code point or if the mapping is to
6876 the null string).
6877 END
6878                 }
6879             }
6880         }
6881         $self->set_comment(main::join_lines($comment));
6882         return;
6883     }
6884
6885     my %swash_keys; # Makes sure don't duplicate swash names.
6886
6887     # The remaining variables are temporaries used while writing each table,
6888     # to output special ranges.
6889     my @multi_code_point_maps;  # Map is to more than one code point.
6890
6891     sub handle_special_range {
6892         # Called in the middle of write when it finds a range it doesn't know
6893         # how to handle.
6894
6895         my $self = shift;
6896         my $range = shift;
6897         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6898
6899         my $addr = do { no overloading; pack 'J', $self; };
6900
6901         my $type = $range->type;
6902
6903         my $low = $range->start;
6904         my $high = $range->end;
6905         my $map = $range->value;
6906
6907         # No need to output the range if it maps to the default.
6908         return if $map eq $default_map{$addr};
6909
6910         my $property = $self->property;
6911
6912         # Switch based on the map type...
6913         if ($type == $HANGUL_SYLLABLE) {
6914
6915             # These are entirely algorithmically determinable based on
6916             # some constants furnished by Unicode; for now, just set a
6917             # flag to indicate that have them.  After everything is figured
6918             # out, we will output the code that does the algorithm.  (Don't
6919             # output them if not needed because we are suppressing this
6920             # property.)
6921             $has_hangul_syllables = 1 if $property->to_output_map;
6922         }
6923         elsif ($type == $CP_IN_NAME) {
6924
6925             # Code points whose name ends in their code point are also
6926             # algorithmically determinable, but need information about the map
6927             # to do so.  Both the map and its inverse are stored in data
6928             # structures output in the file.  They are stored in the mean time
6929             # in global lists The lists will be written out later into Name.pm,
6930             # which is created only if needed.  In order to prevent duplicates
6931             # in the list, only add to them for one property, should multiple
6932             # ones need them.
6933             if ($needing_code_points_ending_in_code_point == 0) {
6934                 $needing_code_points_ending_in_code_point = $property;
6935             }
6936             if ($property == $needing_code_points_ending_in_code_point) {
6937                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6938                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6939
6940                 my $squeezed = $map =~ s/[-\s]+//gr;
6941                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6942                                                                           $low;
6943                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6944                                                                          $high;
6945
6946                 push @code_points_ending_in_code_point, { low => $low,
6947                                                         high => $high,
6948                                                         name => $map
6949                                                         };
6950             }
6951         }
6952         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6953
6954             # Multi-code point maps and null string maps have an entry
6955             # for each code point in the range.  They use the same
6956             # output format.
6957             for my $code_point ($low .. $high) {
6958
6959                 # The pack() below can't cope with surrogates.  XXX This may
6960                 # no longer be true
6961                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6962                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
6963                     next;
6964                 }
6965
6966                 # Generate the hash entries for these in the form that
6967                 # utf8.c understands.
6968                 my $tostr = "";
6969                 my $to_name = "";
6970                 my $to_chr = "";
6971                 foreach my $to (split " ", $map) {
6972                     if ($to !~ /^$code_point_re$/) {
6973                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
6974                         next;
6975                     }
6976                     $tostr .= sprintf "\\x{%s}", $to;
6977                     $to = CORE::hex $to;
6978                     if ($annotate) {
6979                         $to_name .= " + " if $to_name;
6980                         $to_chr .= main::display_chr($to);
6981                         main::populate_char_info($to)
6982                                             if ! defined $viacode[$to];
6983                         $to_name .=  $viacode[$to];
6984                     }
6985                 }
6986
6987                 # I (khw) have never waded through this line to
6988                 # understand it well enough to comment it.
6989                 my $utf8 = sprintf(qq["%s" => "$tostr",],
6990                         join("", map { sprintf "\\x%02X", $_ }
6991                             unpack("U0C*", pack("U", $code_point))));
6992
6993                 # Add a comment so that a human reader can more easily
6994                 # see what's going on.
6995                 push @multi_code_point_maps,
6996                         sprintf("%-45s # U+%04X", $utf8, $code_point);
6997                 if (! $annotate) {
6998                     $multi_code_point_maps[-1] .= " => $map";
6999                 }
7000                 else {
7001                     main::populate_char_info($code_point)
7002                                     if ! defined $viacode[$code_point];
7003                     $multi_code_point_maps[-1] .= " '"
7004                         . main::display_chr($code_point)
7005                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7006                 }
7007             }
7008         }
7009         else {
7010             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7011         }
7012
7013         return;
7014     }
7015
7016     sub pre_body {
7017         # Returns the string that should be output in the file before the main
7018         # body of this table.  It isn't called until the main body is
7019         # calculated, saving a pass.  The string includes some hash entries
7020         # identifying the format of the body, and what the single value should
7021         # be for all ranges missing from it.  It also includes any code points
7022         # which have map_types that don't go in the main table.
7023
7024         my $self = shift;
7025         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7026
7027         my $addr = do { no overloading; pack 'J', $self; };
7028
7029         my $name = $self->property->swash_name;
7030
7031         # Currently there is nothing in the pre_body unless a swash is being
7032         # generated.
7033         return unless defined $name;
7034
7035         if (defined $swash_keys{$name}) {
7036             Carp::my_carp(main::join_lines(<<END
7037 Already created a swash name '$name' for $swash_keys{$name}.  This means that
7038 the same name desired for $self shouldn't be used.  Bad News.  This must be
7039 fixed before production use, but proceeding anyway
7040 END
7041             ));
7042         }
7043         $swash_keys{$name} = "$self";
7044
7045         my $pre_body = "";
7046
7047         # Here we assume we were called after have gone through the whole
7048         # file.  If we actually generated anything for each map type, add its
7049         # respective header and trailer
7050         my $specials_name = "";
7051         if (@multi_code_point_maps) {
7052             $specials_name = "utf8::ToSpec$name";
7053             $pre_body .= <<END;
7054
7055 # Some code points require special handling because their mappings are each to
7056 # multiple code points.  These do not appear in the main body, but are defined
7057 # in the hash below.
7058
7059 # Each key is the string of N bytes that together make up the UTF-8 encoding
7060 # for the code point.  (i.e. the same as looking at the code point's UTF-8
7061 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7062 \%$specials_name = (
7063 END
7064             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7065         }
7066
7067         my $format = $self->format;
7068
7069         my $return = "";
7070
7071         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7072         if ($output_adjusted) {
7073             if ($specials_name) {
7074                 $return .= <<END;
7075 # The mappings in the non-hash portion of this file must be modified to get the
7076 # correct values by adding the code point ordinal number to each one that is
7077 # numeric.
7078 END
7079             }
7080             else {
7081                 $return .= <<END;
7082 # The mappings must be modified to get the correct values by adding the code
7083 # point ordinal number to each one that is numeric.
7084 END
7085             }
7086         }
7087
7088         $return .= <<END;
7089
7090 # The name this swash is to be known by, with the format of the mappings in
7091 # the main body of the table, and what all code points missing from this file
7092 # map to.
7093 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7094 END
7095         if ($specials_name) {
7096             $return .= <<END;
7097 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7098 END
7099         }
7100         my $default_map = $default_map{$addr};
7101
7102         # For $CODE_POINT default maps and using adjustments, instead the default
7103         # becomes zero.
7104         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
7105                 .  (($output_adjusted && $default_map eq $CODE_POINT)
7106                    ? "0"
7107                    : $default_map)
7108                 . "';";
7109
7110         if ($default_map eq $CODE_POINT) {
7111             $return .= ' # code point maps to itself';
7112         }
7113         elsif ($default_map eq "") {
7114             $return .= ' # code point maps to the null string';
7115         }
7116         $return .= "\n";
7117
7118         $return .= $pre_body;
7119
7120         return $return;
7121     }
7122
7123     sub write {
7124         # Write the table to the file.
7125
7126         my $self = shift;
7127         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7128
7129         my $addr = do { no overloading; pack 'J', $self; };
7130
7131         # Clear the temporaries
7132         undef @multi_code_point_maps;
7133
7134         # Calculate the format of the table if not already done.
7135         my $format = $self->format;
7136         my $type = $self->property->type;
7137         my $default_map = $self->default_map;
7138         if (! defined $format) {
7139             if ($type == $BINARY) {
7140
7141                 # Don't bother checking the values, because we elsewhere
7142                 # verify that a binary table has only 2 values.
7143                 $format = $BINARY_FORMAT;
7144             }
7145             else {
7146                 my @ranges = $self->_range_list->ranges;
7147
7148                 # default an empty table based on its type and default map
7149                 if (! @ranges) {
7150
7151                     # But it turns out that the only one we can say is a
7152                     # non-string (besides binary, handled above) is when the
7153                     # table is a string and the default map is to a code point
7154                     if ($type == $STRING && $default_map eq $CODE_POINT) {
7155                         $format = $HEX_FORMAT;
7156                     }
7157                     else {
7158                         $format = $STRING_FORMAT;
7159                     }
7160                 }
7161                 else {
7162
7163                     # Start with the most restrictive format, and as we find
7164                     # something that doesn't fit with that, change to the next
7165                     # most restrictive, and so on.
7166                     $format = $DECIMAL_FORMAT;
7167                     foreach my $range (@ranges) {
7168                         next if $range->type != 0;  # Non-normal ranges don't
7169                                                     # affect the main body
7170                         my $map = $range->value;
7171                         if ($map ne $default_map) {
7172                             last if $format eq $STRING_FORMAT;  # already at
7173                                                                 # least
7174                                                                 # restrictive
7175                             $format = $INTEGER_FORMAT
7176                                                 if $format eq $DECIMAL_FORMAT
7177                                                     && $map !~ / ^ [0-9] $ /x;
7178                             $format = $FLOAT_FORMAT
7179                                             if $format eq $INTEGER_FORMAT
7180                                                 && $map !~ / ^ -? [0-9]+ $ /x;
7181                             $format = $RATIONAL_FORMAT
7182                                 if $format eq $FLOAT_FORMAT
7183                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7184                             $format = $HEX_FORMAT
7185                                 if ($format eq $RATIONAL_FORMAT
7186                                        && $map !~
7187                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7188                                         # Assume a leading zero means hex,
7189                                         # even if all digits are 0-9
7190                                     || ($format eq $INTEGER_FORMAT
7191                                         && $map =~ /^0[0-9A-F]/);
7192                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7193                                                        && $map =~ /[^0-9A-F]/;
7194                         }
7195                     }
7196                 }
7197             }
7198         } # end of calculating format
7199
7200         if ($default_map eq $CODE_POINT
7201             && $format ne $HEX_FORMAT
7202             && ! defined $self->format)    # manual settings are always
7203                                            # considered ok
7204         {
7205             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7206         }
7207
7208         # If the output is to be adjusted, the format of the table that gets
7209         # output is actually 'a' or 'ax' instead of whatever it is stored
7210         # internally as.
7211         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7212         if ($output_adjusted) {
7213             if ($default_map eq $CODE_POINT) {
7214                 $format = $HEX_ADJUST_FORMAT;
7215             }
7216             else {
7217                 $format = $ADJUST_FORMAT;
7218             }
7219         }
7220
7221         $self->_set_format($format);
7222
7223         return $self->SUPER::write(
7224             $output_adjusted,
7225             $default_map);   # don't write defaulteds
7226     }
7227
7228     # Accessors for the underlying list that should fail if locked.
7229     for my $sub (qw(
7230                     add_duplicate
7231                 ))
7232     {
7233         no strict "refs";
7234         *$sub = sub {
7235             use strict "refs";
7236             my $self = shift;
7237
7238             return if $self->carp_if_locked;
7239             return $self->_range_list->$sub(@_);
7240         }
7241     }
7242 } # End closure for Map_Table
7243
7244 package Match_Table;
7245 use parent '-norequire', '_Base_Table';
7246
7247 # A Match table is one which is a list of all the code points that have
7248 # the same property and property value, for use in \p{property=value}
7249 # constructs in regular expressions.  It adds very little data to the base
7250 # structure, but many methods, as these lists can be combined in many ways to
7251 # form new ones.
7252 # There are only a few concepts added:
7253 # 1) Equivalents and Relatedness.
7254 #    Two tables can match the identical code points, but have different names.
7255 #    This always happens when there is a perl single form extension
7256 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7257 #    tables are set to be related, with the Perl extension being a child, and
7258 #    the Unicode property being the parent.
7259 #
7260 #    It may be that two tables match the identical code points and we don't
7261 #    know if they are related or not.  This happens most frequently when the
7262 #    Block and Script properties have the exact range.  But note that a
7263 #    revision to Unicode could add new code points to the script, which would
7264 #    now have to be in a different block (as the block was filled, or there
7265 #    would have been 'Unknown' script code points in it and they wouldn't have
7266 #    been identical).  So we can't rely on any two properties from Unicode
7267 #    always matching the same code points from release to release, and thus
7268 #    these tables are considered coincidentally equivalent--not related.  When
7269 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7270 #    'leader', and the others are 'equivalents'.  This concept is useful
7271 #    to minimize the number of tables written out.  Only one file is used for
7272 #    any identical set of code points, with entries in Heavy.pl mapping all
7273 #    the involved tables to it.
7274 #
7275 #    Related tables will always be identical; we set them up to be so.  Thus
7276 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7277 #    unrelated tables.  Relatedness makes generating the documentation easier.
7278 #
7279 # 2) Complement.
7280 #    Like equivalents, two tables may be the inverses of each other, the
7281 #    intersection between them is null, and the union is every Unicode code
7282 #    point.  The two tables that occupy a binary property are necessarily like
7283 #    this.  By specifying one table as the complement of another, we can avoid
7284 #    storing it on disk (using the other table and performing a fast
7285 #    transform), and some memory and calculations.
7286 #
7287 # 3) Conflicting.  It may be that there will eventually be name clashes, with
7288 #    the same name meaning different things.  For a while, there actually were
7289 #    conflicts, but they have so far been resolved by changing Perl's or
7290 #    Unicode's definitions to match the other, but when this code was written,
7291 #    it wasn't clear that that was what was going to happen.  (Unicode changed
7292 #    because of protests during their beta period.)  Name clashes are warned
7293 #    about during compilation, and the documentation.  The generated tables
7294 #    are sane, free of name clashes, because the code suppresses the Perl
7295 #    version.  But manual intervention to decide what the actual behavior
7296 #    should be may be required should this happen.  The introductory comments
7297 #    have more to say about this.
7298
7299 sub standardize { return main::standardize($_[0]); }
7300 sub trace { return main::trace(@_); }
7301
7302
7303 { # Closure
7304
7305     main::setup_package();
7306
7307     my %leader;
7308     # The leader table of this one; initially $self.
7309     main::set_access('leader', \%leader, 'r');
7310
7311     my %equivalents;
7312     # An array of any tables that have this one as their leader
7313     main::set_access('equivalents', \%equivalents, 'readable_array');
7314
7315     my %parent;
7316     # The parent table to this one, initially $self.  This allows us to
7317     # distinguish between equivalent tables that are related (for which this
7318     # is set to), and those which may not be, but share the same output file
7319     # because they match the exact same set of code points in the current
7320     # Unicode release.
7321     main::set_access('parent', \%parent, 'r');
7322
7323     my %children;
7324     # An array of any tables that have this one as their parent
7325     main::set_access('children', \%children, 'readable_array');
7326
7327     my %conflicting;
7328     # Array of any tables that would have the same name as this one with
7329     # a different meaning.  This is used for the generated documentation.
7330     main::set_access('conflicting', \%conflicting, 'readable_array');
7331
7332     my %matches_all;
7333     # Set in the constructor for tables that are expected to match all code
7334     # points.
7335     main::set_access('matches_all', \%matches_all, 'r');
7336
7337     my %complement;
7338     # Points to the complement that this table is expressed in terms of; 0 if
7339     # none.
7340     main::set_access('complement', \%complement, 'r');
7341
7342     sub new {
7343         my $class = shift;
7344
7345         my %args = @_;
7346
7347         # The property for which this table is a listing of property values.
7348         my $property = delete $args{'_Property'};
7349
7350         my $name = delete $args{'Name'};
7351         my $full_name = delete $args{'Full_Name'};
7352         $full_name = $name if ! defined $full_name;
7353
7354         # Optional
7355         my $initialize = delete $args{'Initialize'};
7356         my $matches_all = delete $args{'Matches_All'} || 0;
7357         my $format = delete $args{'Format'};
7358         # Rest of parameters passed on.
7359
7360         my $range_list = Range_List->new(Initialize => $initialize,
7361                                          Owner => $property);
7362
7363         my $complete = $full_name;
7364         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7365                                               # but this helps debug if it
7366                                               # does
7367         # The complete name for a match table includes it's property in a
7368         # compound form 'property=table', except if the property is the
7369         # pseudo-property, perl, in which case it is just the single form,
7370         # 'table' (If you change the '=' must also change the ':' in lots of
7371         # places in this program that assume an equal sign)
7372         $complete = $property->full_name . "=$complete" if $property != $perl;
7373
7374         my $self = $class->SUPER::new(%args,
7375                                       Name => $name,
7376                                       Complete_Name => $complete,
7377                                       Full_Name => $full_name,
7378                                       _Property => $property,
7379                                       _Range_List => $range_list,
7380                                       Format => $EMPTY_FORMAT,
7381                                       Write_As_Invlist => 1,
7382                                       );
7383         my $addr = do { no overloading; pack 'J', $self; };
7384
7385         $conflicting{$addr} = [ ];
7386         $equivalents{$addr} = [ ];
7387         $children{$addr} = [ ];
7388         $matches_all{$addr} = $matches_all;
7389         $leader{$addr} = $self;
7390         $parent{$addr} = $self;
7391         $complement{$addr} = 0;
7392
7393         if (defined $format && $format ne $EMPTY_FORMAT) {
7394             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7395         }
7396
7397         return $self;
7398     }
7399
7400     # See this program's beginning comment block about overloading these.
7401     use overload
7402         fallback => 0,
7403         qw("") => "_operator_stringify",
7404         '=' => sub {
7405                     my $self = shift;
7406
7407                     return if $self->carp_if_locked;
7408                     return $self;
7409                 },
7410
7411         '+' => sub {
7412                         my $self = shift;
7413                         my $other = shift;
7414
7415                         return $self->_range_list + $other;
7416                     },
7417         '&' => sub {
7418                         my $self = shift;
7419                         my $other = shift;
7420
7421                         return $self->_range_list & $other;
7422                     },
7423         '+=' => sub {
7424                         my $self = shift;
7425                         my $other = shift;
7426                         my $reversed = shift;
7427
7428                         if ($reversed) {
7429                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7430                             . ref($other)
7431                             . ' += '
7432                             . ref($self)
7433                             . "'.  undef returned.");
7434                             return;
7435                         }
7436
7437                         return if $self->carp_if_locked;
7438
7439                         my $addr = do { no overloading; pack 'J', $self; };
7440
7441                         if (ref $other) {
7442
7443                             # Change the range list of this table to be the
7444                             # union of the two.
7445                             $self->_set_range_list($self->_range_list
7446                                                     + $other);
7447                         }
7448                         else {    # $other is just a simple value
7449                             $self->add_range($other, $other);
7450                         }
7451                         return $self;
7452                     },
7453         '&=' => sub {
7454                         my $self = shift;
7455                         my $other = shift;
7456                         my $reversed = shift;
7457
7458                         if ($reversed) {
7459                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7460                             . ref($other)
7461                             . ' &= '
7462                             . ref($self)
7463                             . "'.  undef returned.");
7464                             return;
7465                         }
7466
7467                         return if $self->carp_if_locked;
7468                         $self->_set_range_list($self->_range_list & $other);
7469                         return $self;
7470                     },
7471         '-' => sub { my $self = shift;
7472                     my $other = shift;
7473                     my $reversed = shift;
7474                     if ($reversed) {
7475                         Carp::my_carp_bug("Bad news.  Can't cope with '"
7476                         . ref($other)
7477                         . ' - '
7478                         . ref($self)
7479                         . "'.  undef returned.");
7480                         return;
7481                     }
7482
7483                     return $self->_range_list - $other;
7484                 },
7485         '~' => sub { my $self = shift;
7486                     return ~ $self->_range_list;
7487                 },
7488     ;
7489
7490     sub _operator_stringify {
7491         my $self = shift;
7492
7493         my $name = $self->complete_name;
7494         return "Table '$name'";
7495     }
7496
7497     sub _range_list {
7498         # Returns the range list associated with this table, which will be the
7499         # complement's if it has one.
7500
7501         my $self = shift;
7502         my $complement;
7503         if (($complement = $self->complement) != 0) {
7504             return ~ $complement->_range_list;
7505         }
7506         else {
7507             return $self->SUPER::_range_list;
7508         }
7509     }
7510
7511     sub add_alias {
7512         # Add a synonym for this table.  See the comments in the base class
7513
7514         my $self = shift;
7515         my $name = shift;
7516         # Rest of parameters passed on.
7517
7518         $self->SUPER::add_alias($name, $self, @_);
7519         return;
7520     }
7521
7522     sub add_conflicting {
7523         # Add the name of some other object to the list of ones that name
7524         # clash with this match table.
7525
7526         my $self = shift;
7527         my $conflicting_name = shift;   # The name of the conflicting object
7528         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7529         my $conflicting_object = shift; # Optional, the conflicting object
7530                                         # itself.  This is used to
7531                                         # disambiguate the text if the input
7532                                         # name is identical to any of the
7533                                         # aliases $self is known by.
7534                                         # Sometimes the conflicting object is
7535                                         # merely hypothetical, so this has to
7536                                         # be an optional parameter.
7537         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7538
7539         my $addr = do { no overloading; pack 'J', $self; };
7540
7541         # Check if the conflicting name is exactly the same as any existing
7542         # alias in this table (as long as there is a real object there to
7543         # disambiguate with).
7544         if (defined $conflicting_object) {
7545             foreach my $alias ($self->aliases) {
7546                 if ($alias->name eq $conflicting_name) {
7547
7548                     # Here, there is an exact match.  This results in
7549                     # ambiguous comments, so disambiguate by changing the
7550                     # conflicting name to its object's complete equivalent.
7551                     $conflicting_name = $conflicting_object->complete_name;
7552                     last;
7553                 }
7554             }
7555         }
7556
7557         # Convert to the \p{...} final name
7558         $conflicting_name = "\\$p" . "{$conflicting_name}";
7559
7560         # Only add once
7561         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7562
7563         push @{$conflicting{$addr}}, $conflicting_name;
7564
7565         return;
7566     }
7567
7568     sub is_set_equivalent_to {
7569         # Return boolean of whether or not the other object is a table of this
7570         # type and has been marked equivalent to this one.
7571
7572         my $self = shift;
7573         my $other = shift;
7574         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7575
7576         return 0 if ! defined $other; # Can happen for incomplete early
7577                                       # releases
7578         unless ($other->isa(__PACKAGE__)) {
7579             my $ref_other = ref $other;
7580             my $ref_self = ref $self;
7581             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7582             return 0;
7583         }
7584
7585         # Two tables are equivalent if they have the same leader.
7586         no overloading;
7587         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7588         return;
7589     }
7590
7591     sub set_equivalent_to {
7592         # Set $self equivalent to the parameter table.
7593         # The required Related => 'x' parameter is a boolean indicating
7594         # whether these tables are related or not.  If related, $other becomes
7595         # the 'parent' of $self; if unrelated it becomes the 'leader'
7596         #
7597         # Related tables share all characteristics except names; equivalents
7598         # not quite so many.
7599         # If they are related, one must be a perl extension.  This is because
7600         # we can't guarantee that Unicode won't change one or the other in a
7601         # later release even if they are identical now.
7602
7603         my $self = shift;
7604         my $other = shift;
7605
7606         my %args = @_;
7607         my $related = delete $args{'Related'};
7608
7609         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7610
7611         return if ! defined $other;     # Keep on going; happens in some early
7612                                         # Unicode releases.
7613
7614         if (! defined $related) {
7615             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7616             $related = 0;
7617         }
7618
7619         # If already are equivalent, no need to re-do it;  if subroutine
7620         # returns null, it found an error, also do nothing
7621         my $are_equivalent = $self->is_set_equivalent_to($other);
7622         return if ! defined $are_equivalent || $are_equivalent;
7623
7624         my $addr = do { no overloading; pack 'J', $self; };
7625         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7626
7627         if ($related) {
7628             if ($current_leader->perl_extension) {
7629                 if ($other->perl_extension) {
7630                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7631                     return;
7632                 }
7633             } elsif ($self->property != $other->property    # Depending on
7634                                                             # situation, might
7635                                                             # be better to use
7636                                                             # add_alias()
7637                                                             # instead for same
7638                                                             # property
7639                      && ! $other->perl_extension)
7640             {
7641                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7642                 $related = 0;
7643             }
7644         }
7645
7646         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7647             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7648             return;
7649         }
7650
7651         my $leader = do { no overloading; pack 'J', $current_leader; };
7652         my $other_addr = do { no overloading; pack 'J', $other; };
7653
7654         # Any tables that are equivalent to or children of this table must now
7655         # instead be equivalent to or (children) to the new leader (parent),
7656         # still equivalent.  The equivalency includes their matches_all info,
7657         # and for related tables, their fate and status.
7658         # All related tables are of necessity equivalent, but the converse
7659         # isn't necessarily true
7660         my $status = $other->status;
7661         my $status_info = $other->status_info;
7662         my $fate = $other->fate;
7663         my $matches_all = $matches_all{other_addr};
7664         my $caseless_equivalent = $other->caseless_equivalent;
7665         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7666             next if $table == $other;
7667             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7668
7669             my $table_addr = do { no overloading; pack 'J', $table; };
7670             $leader{$table_addr} = $other;
7671             $matches_all{$table_addr} = $matches_all;
7672             $self->_set_range_list($other->_range_list);
7673             push @{$equivalents{$other_addr}}, $table;
7674             if ($related) {
7675                 $parent{$table_addr} = $other;
7676                 push @{$children{$other_addr}}, $table;
7677                 $table->set_status($status, $status_info);
7678
7679                 # This reason currently doesn't get exposed outside; otherwise
7680                 # would have to look up the parent's reason and use it instead.
7681                 $table->set_fate($fate, "Parent's fate");
7682
7683                 $self->set_caseless_equivalent($caseless_equivalent);
7684             }
7685         }
7686
7687         # Now that we've declared these to be equivalent, any changes to one
7688         # of the tables would invalidate that equivalency.
7689         $self->lock;
7690         $other->lock;
7691         return;
7692     }
7693
7694     sub set_complement {
7695         # Set $self to be the complement of the parameter table.  $self is
7696         # locked, as what it contains should all come from the other table.
7697
7698         my $self = shift;
7699         my $other = shift;
7700
7701         my %args = @_;
7702         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7703
7704         if ($other->complement != 0) {
7705             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7706             return;
7707         }
7708         my $addr = do { no overloading; pack 'J', $self; };
7709         $complement{$addr} = $other;
7710         $self->lock;
7711         return;
7712     }
7713
7714     sub add_range { # Add a range to the list for this table.
7715         my $self = shift;
7716         # Rest of parameters passed on
7717
7718         return if $self->carp_if_locked;
7719         return $self->_range_list->add_range(@_);
7720     }
7721
7722     sub header {
7723         my $self = shift;
7724         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7725
7726         # All match tables are to be used only by the Perl core.
7727         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7728     }
7729
7730     sub pre_body {  # Does nothing for match tables.
7731         return
7732     }
7733
7734     sub append_to_body {  # Does nothing for match tables.
7735         return
7736     }
7737
7738     sub set_fate {
7739         my $self = shift;
7740         my $fate = shift;
7741         my $reason = shift;
7742         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7743
7744         $self->SUPER::set_fate($fate, $reason);
7745
7746         # All children share this fate
7747         foreach my $child ($self->children) {
7748             $child->set_fate($fate, $reason);
7749         }
7750         return;
7751     }
7752
7753     sub write {
7754         my $self = shift;
7755         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7756
7757         return $self->SUPER::write(0); # No adjustments
7758     }
7759
7760     sub set_final_comment {
7761         # This creates a comment for the file that is to hold the match table
7762         # $self.  It is somewhat convoluted to make the English read nicely,
7763         # but, heh, it's just a comment.
7764         # This should be called only with the leader match table of all the
7765         # ones that share the same file.  It lists all such tables, ordered so
7766         # that related ones are together.
7767
7768         return unless $debugging_build;
7769
7770         my $leader = shift;   # Should only be called on the leader table of
7771                               # an equivalent group
7772         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7773
7774         my $addr = do { no overloading; pack 'J', $leader; };
7775
7776         if ($leader{$addr} != $leader) {
7777             Carp::my_carp_bug(<<END
7778 set_final_comment() must be called on a leader table, which $leader is not.
7779 It is equivalent to $leader{$addr}.  No comment created
7780 END
7781             );
7782             return;
7783         }
7784
7785         # Get the number of code points matched by each of the tables in this
7786         # file, and add underscores for clarity.
7787         my $count = $leader->count;
7788         my $unicode_count;
7789         my $non_unicode_string;
7790         if ($count > $MAX_UNICODE_CODEPOINTS) {
7791             $unicode_count = $count - ($MAX_WORKING_CODEPOINT
7792                                        - $MAX_UNICODE_CODEPOINT);
7793             $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
7794         }
7795         else {
7796             $unicode_count = $count;
7797             $non_unicode_string = "";
7798         }
7799         my $string_count = main::clarify_code_point_count($unicode_count);
7800
7801         my $loose_count = 0;        # how many aliases loosely matched
7802         my $compound_name = "";     # ? Are any names compound?, and if so, an
7803                                     # example
7804         my $properties_with_compound_names = 0;    # count of these
7805
7806
7807         my %flags;              # The status flags used in the file
7808         my $total_entries = 0;  # number of entries written in the comment
7809         my $matches_comment = ""; # The portion of the comment about the
7810                                   # \p{}'s
7811         my @global_comments;    # List of all the tables' comments that are
7812                                 # there before this routine was called.
7813         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
7814                                 # Unicode::UCD.  If not, then don't say it is
7815                                 # in the comment
7816
7817         # Get list of all the parent tables that are equivalent to this one
7818         # (including itself).
7819         my @parents = grep { $parent{main::objaddr $_} == $_ }
7820                             main::uniques($leader, @{$equivalents{$addr}});
7821         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
7822                                               # tables
7823
7824         for my $parent (@parents) {
7825
7826             my $property = $parent->property;
7827
7828             # Special case 'N' tables in properties with two match tables when
7829             # the other is a 'Y' one.  These are likely to be binary tables,
7830             # but not necessarily.  In either case, \P{} will match the
7831             # complement of \p{}, and so if something is a synonym of \p, the
7832             # complement of that something will be the synonym of \P.  This
7833             # would be true of any property with just two match tables, not
7834             # just those whose values are Y and N; but that would require a
7835             # little extra work, and there are none such so far in Unicode.
7836             my $perl_p = 'p';        # which is it?  \p{} or \P{}
7837             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
7838
7839             if (scalar $property->tables == 2
7840                 && $parent == $property->table('N')
7841                 && defined (my $yes = $property->table('Y')))
7842             {
7843                 my $yes_addr = do { no overloading; pack 'J', $yes; };
7844                 @yes_perl_synonyms
7845                     = grep { $_->property == $perl }
7846                                     main::uniques($yes,
7847                                                 $parent{$yes_addr},
7848                                                 $parent{$yes_addr}->children);
7849
7850                 # But these synonyms are \P{} ,not \p{}
7851                 $perl_p = 'P';
7852             }
7853
7854             my @description;        # Will hold the table description
7855             my @note;               # Will hold the table notes.
7856             my @conflicting;        # Will hold the table conflicts.
7857
7858             # Look at the parent, any yes synonyms, and all the children
7859             my $parent_addr = do { no overloading; pack 'J', $parent; };
7860             for my $table ($parent,
7861                            @yes_perl_synonyms,
7862                            @{$children{$parent_addr}})
7863             {
7864                 my $table_addr = do { no overloading; pack 'J', $table; };
7865                 my $table_property = $table->property;
7866
7867                 # Tables are separated by a blank line to create a grouping.
7868                 $matches_comment .= "\n" if $matches_comment;
7869
7870                 # The table is named based on the property and value
7871                 # combination it is for, like script=greek.  But there may be
7872                 # a number of synonyms for each side, like 'sc' for 'script',
7873                 # and 'grek' for 'greek'.  Any combination of these is a valid
7874                 # name for this table.  In this case, there are three more,
7875                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
7876                 # listing all possible combinations in the comment, we make
7877                 # sure that each synonym occurs at least once, and add
7878                 # commentary that the other combinations are possible.
7879                 # Because regular expressions don't recognize things like
7880                 # \p{jsn=}, only look at non-null right-hand-sides
7881                 my @property_aliases = $table_property->aliases;
7882                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7883
7884                 # The alias lists above are already ordered in the order we
7885                 # want to output them.  To ensure that each synonym is listed,
7886                 # we must use the max of the two numbers.  But if there are no
7887                 # legal synonyms (nothing in @table_aliases), then we don't
7888                 # list anything.
7889                 my $listed_combos = (@table_aliases)
7890                                     ?  main::max(scalar @table_aliases,
7891                                                  scalar @property_aliases)
7892                                     : 0;
7893                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7894
7895
7896                 my $property_had_compound_name = 0;
7897
7898                 for my $i (0 .. $listed_combos - 1) {
7899                     $total_entries++;
7900
7901                     # The current alias for the property is the next one on
7902                     # the list, or if beyond the end, start over.  Similarly
7903                     # for the table (\p{prop=table})
7904                     my $property_alias = $property_aliases
7905                                             [$i % @property_aliases]->name;
7906                     my $table_alias_object = $table_aliases
7907                                                         [$i % @table_aliases];
7908                     my $table_alias = $table_alias_object->name;
7909                     my $loose_match = $table_alias_object->loose_match;
7910                     $has_ucd_alias |= $table_alias_object->ucd;
7911
7912                     if ($table_alias !~ /\D/) { # Clarify large numbers.
7913                         $table_alias = main::clarify_number($table_alias)
7914                     }
7915
7916                     # Add a comment for this alias combination
7917                     my $current_match_comment;
7918                     if ($table_property == $perl) {
7919                         $current_match_comment = "\\$perl_p"
7920                                                     . "{$table_alias}";
7921                     }
7922                     else {
7923                         $current_match_comment
7924                                         = "\\p{$property_alias=$table_alias}";
7925                         $property_had_compound_name = 1;
7926                     }
7927
7928                     # Flag any abnormal status for this table.
7929                     my $flag = $property->status
7930                                 || $table->status
7931                                 || $table_alias_object->status;
7932                     if ($flag && $flag ne $PLACEHOLDER) {
7933                         $flags{$flag} = $status_past_participles{$flag};
7934                     }
7935
7936                     $loose_count++;
7937
7938                     # Pretty up the comment.  Note the \b; it says don't make
7939                     # this line a continuation.
7940                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7941                                         $flag,
7942                                         " " x 7,
7943                                         $current_match_comment);
7944                 } # End of generating the entries for this table.
7945
7946                 # Save these for output after this group of related tables.
7947                 push @description, $table->description;
7948                 push @note, $table->note;
7949                 push @conflicting, $table->conflicting;
7950
7951                 # And this for output after all the tables.
7952                 push @global_comments, $table->comment;
7953
7954                 # Compute an alternate compound name using the final property
7955                 # synonym and the first table synonym with a colon instead of
7956                 # the equal sign used elsewhere.
7957                 if ($property_had_compound_name) {
7958                     $properties_with_compound_names ++;
7959                     if (! $compound_name || @property_aliases > 1) {
7960                         $compound_name = $property_aliases[-1]->name
7961                                         . ': '
7962                                         . $table_aliases[0]->name;
7963                     }
7964                 }
7965             } # End of looping through all children of this table
7966
7967             # Here have assembled in $matches_comment all the related tables
7968             # to the current parent (preceded by the same info for all the
7969             # previous parents).  Put out information that applies to all of
7970             # the current family.
7971             if (@conflicting) {
7972
7973                 # But output the conflicting information now, as it applies to
7974                 # just this table.
7975                 my $conflicting = join ", ", @conflicting;
7976                 if ($conflicting) {
7977                     $matches_comment .= <<END;
7978
7979     Note that contrary to what you might expect, the above is NOT the same as
7980 END
7981                     $matches_comment .= "any of: " if @conflicting > 1;
7982                     $matches_comment .= "$conflicting\n";
7983                 }
7984             }
7985             if (@description) {
7986                 $matches_comment .= "\n    Meaning: "
7987                                     . join('; ', @description)
7988                                     . "\n";
7989             }
7990             if (@note) {
7991                 $matches_comment .= "\n    Note: "
7992                                     . join("\n    ", @note)
7993                                     . "\n";
7994             }
7995         } # End of looping through all tables
7996
7997         $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
7998
7999
8000         my $code_points;
8001         my $match;
8002         my $any_of_these;
8003         if ($unicode_count == 1) {
8004             $match = 'matches';
8005             $code_points = 'single code point';
8006         }
8007         else {
8008             $match = 'match';
8009             $code_points = "$string_count code points";
8010         }
8011
8012         my $synonyms;
8013         my $entries;
8014         if ($total_entries == 1) {
8015             $synonyms = "";
8016             $entries = 'entry';
8017             $any_of_these = 'this'
8018         }
8019         else {
8020             $synonyms = " any of the following regular expression constructs";
8021             $entries = 'entries';
8022             $any_of_these = 'any of these'
8023         }
8024
8025         my $comment = "";
8026         if ($has_ucd_alias) {
8027             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8028         }
8029         if ($has_unrelated) {
8030             $comment .= <<END;
8031 This file is for tables that are not necessarily related:  To conserve
8032 resources, every table that matches the identical set of code points in this
8033 version of Unicode uses this file.  Each one is listed in a separate group
8034 below.  It could be that the tables will match the same set of code points in
8035 other Unicode releases, or it could be purely coincidence that they happen to
8036 be the same in Unicode $string_version, and hence may not in other versions.
8037
8038 END
8039         }
8040
8041         if (%flags) {
8042             foreach my $flag (sort keys %flags) {
8043                 $comment .= <<END;
8044 '$flag' below means that this form is $flags{$flag}.
8045 Consult $pod_file.pod
8046 END
8047             }
8048             $comment .= "\n";
8049         }
8050
8051         if ($total_entries == 0) {
8052             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8053             $comment .= <<END;
8054 This file returns the $code_points in Unicode Version
8055 $string_version for
8056 $leader, but it is inaccessible through Perl regular expressions, as
8057 "\\p{prop=}" is not recognized.
8058 END
8059
8060         } else {
8061             $comment .= <<END;
8062 This file returns the $code_points in Unicode Version
8063 $string_version that
8064 $match$synonyms:
8065
8066 $matches_comment
8067 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8068 including if adding or subtracting white space, underscore, and hyphen
8069 characters matters or doesn't matter, and other permissible syntactic
8070 variants.  Upper/lower case distinctions never matter.
8071 END
8072
8073         }
8074         if ($compound_name) {
8075             $comment .= <<END;
8076
8077 A colon can be substituted for the equals sign, and
8078 END
8079             if ($properties_with_compound_names > 1) {
8080                 $comment .= <<END;
8081 within each group above,
8082 END
8083             }
8084             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8085
8086             # Note the \b below, it says don't make that line a continuation.
8087             $comment .= <<END;
8088 anything to the left of the equals (or colon) can be combined with anything to
8089 the right.  Thus, for example,
8090 $compound_name
8091 \bis also valid.
8092 END
8093         }
8094
8095         # And append any comment(s) from the actual tables.  They are all
8096         # gathered here, so may not read all that well.
8097         if (@global_comments) {
8098             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8099         }
8100
8101         if ($count) {   # The format differs if no code points, and needs no
8102                         # explanation in that case
8103             if ($leader->write_as_invlist) {
8104                 $comment.= <<END;
8105
8106 The first data line of this file begins with the letter V to indicate it is in
8107 inversion list format.  The number following the V gives the number of lines
8108 remaining.  Each of those remaining lines is a single number representing the
8109 starting code point of a range which goes up to but not including the number
8110 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8111 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8112 the property.  The final line's range extends to the platform's infinity.
8113 END
8114             }
8115             else {
8116                 $comment.= <<END;
8117 The format of the lines of this file is:
8118 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8119 STOP is the ending point, or if omitted, the range has just one code point.
8120 END
8121             }
8122             if ($leader->output_range_counts) {
8123                 $comment .= <<END;
8124 Numbers in comments in [brackets] indicate how many code points are in the
8125 range.
8126 END
8127             }
8128         }
8129
8130         $leader->set_comment(main::join_lines($comment));
8131         return;
8132     }
8133
8134     # Accessors for the underlying list
8135     for my $sub (qw(
8136                     get_valid_code_point
8137                     get_invalid_code_point
8138                 ))
8139     {
8140         no strict "refs";
8141         *$sub = sub {
8142             use strict "refs";
8143             my $self = shift;
8144
8145             return $self->_range_list->$sub(@_);
8146         }
8147     }
8148 } # End closure for Match_Table
8149
8150 package Property;
8151
8152 # The Property class represents a Unicode property, or the $perl
8153 # pseudo-property.  It contains a map table initialized empty at construction
8154 # time, and for properties accessible through regular expressions, various
8155 # match tables, created through the add_match_table() method, and referenced
8156 # by the table('NAME') or tables() methods, the latter returning a list of all
8157 # of the match tables.  Otherwise table operations implicitly are for the map
8158 # table.
8159 #
8160 # Most of the data in the property is actually about its map table, so it
8161 # mostly just uses that table's accessors for most methods.  The two could
8162 # have been combined into one object, but for clarity because of their
8163 # differing semantics, they have been kept separate.  It could be argued that
8164 # the 'file' and 'directory' fields should be kept with the map table.
8165 #
8166 # Each property has a type.  This can be set in the constructor, or in the
8167 # set_type accessor, but mostly it is figured out by the data.  Every property
8168 # starts with unknown type, overridden by a parameter to the constructor, or
8169 # as match tables are added, or ranges added to the map table, the data is
8170 # inspected, and the type changed.  After the table is mostly or entirely
8171 # filled, compute_type() should be called to finalize they analysis.
8172 #
8173 # There are very few operations defined.  One can safely remove a range from
8174 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8175 # table to this one, replacing any in the intersection of the two.
8176
8177 sub standardize { return main::standardize($_[0]); }
8178 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8179
8180 {   # Closure
8181
8182     # This hash will contain as keys, all the aliases of all properties, and
8183     # as values, pointers to their respective property objects.  This allows
8184     # quick look-up of a property from any of its names.
8185     my %alias_to_property_of;
8186
8187     sub dump_alias_to_property_of {
8188         # For debugging
8189
8190         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8191         return;
8192     }
8193
8194     sub property_ref {
8195         # This is a package subroutine, not called as a method.
8196         # If the single parameter is a literal '*' it returns a list of all
8197         # defined properties.
8198         # Otherwise, the single parameter is a name, and it returns a pointer
8199         # to the corresponding property object, or undef if none.
8200         #
8201         # Properties can have several different names.  The 'standard' form of
8202         # each of them is stored in %alias_to_property_of as they are defined.
8203         # But it's possible that this subroutine will be called with some
8204         # variant, so if the initial lookup fails, it is repeated with the
8205         # standardized form of the input name.  If found, besides returning the
8206         # result, the input name is added to the list so future calls won't
8207         # have to do the conversion again.
8208
8209         my $name = shift;
8210
8211         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8212
8213         if (! defined $name) {
8214             Carp::my_carp_bug("Undefined input property.  No action taken.");
8215             return;
8216         }
8217
8218         return main::uniques(values %alias_to_property_of) if $name eq '*';
8219
8220         # Return cached result if have it.
8221         my $result = $alias_to_property_of{$name};
8222         return $result if defined $result;
8223
8224         # Convert the input to standard form.
8225         my $standard_name = standardize($name);
8226
8227         $result = $alias_to_property_of{$standard_name};
8228         return unless defined $result;        # Don't cache undefs
8229
8230         # Cache the result before returning it.
8231         $alias_to_property_of{$name} = $result;
8232         return $result;
8233     }
8234
8235
8236     main::setup_package();
8237
8238     my %map;
8239     # A pointer to the map table object for this property
8240     main::set_access('map', \%map);
8241
8242     my %full_name;
8243     # The property's full name.  This is a duplicate of the copy kept in the
8244     # map table, but is needed because stringify needs it during
8245     # construction of the map table, and then would have a chicken before egg
8246     # problem.
8247     main::set_access('full_name', \%full_name, 'r');
8248
8249     my %table_ref;
8250     # This hash will contain as keys, all the aliases of any match tables
8251     # attached to this property, and as values, the pointers to their
8252     # respective tables.  This allows quick look-up of a table from any of its
8253     # names.
8254     main::set_access('table_ref', \%table_ref);
8255
8256     my %type;
8257     # The type of the property, $ENUM, $BINARY, etc
8258     main::set_access('type', \%type, 'r');
8259
8260     my %file;
8261     # The filename where the map table will go (if actually written).
8262     # Normally defaulted, but can be overridden.
8263     main::set_access('file', \%file, 'r', 's');
8264
8265     my %directory;
8266     # The directory where the map table will go (if actually written).
8267     # Normally defaulted, but can be overridden.
8268     main::set_access('directory', \%directory, 's');
8269
8270     my %pseudo_map_type;
8271     # This is used to affect the calculation of the map types for all the
8272     # ranges in the table.  It should be set to one of the values that signify
8273     # to alter the calculation.
8274     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8275
8276     my %has_only_code_point_maps;
8277     # A boolean used to help in computing the type of data in the map table.
8278     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8279
8280     my %unique_maps;
8281     # A list of the first few distinct mappings this property has.  This is
8282     # used to disambiguate between binary and enum property types, so don't
8283     # have to keep more than three.
8284     main::set_access('unique_maps', \%unique_maps);
8285
8286     my %pre_declared_maps;
8287     # A boolean that gives whether the input data should declare all the
8288     # tables used, or not.  If the former, unknown ones raise a warning.
8289     main::set_access('pre_declared_maps',
8290                                     \%pre_declared_maps, 'r', 's');
8291
8292     sub new {
8293         # The only required parameter is the positionally first, name.  All
8294         # other parameters are key => value pairs.  See the documentation just
8295         # above for the meanings of the ones not passed directly on to the map
8296         # table constructor.
8297
8298         my $class = shift;
8299         my $name = shift || "";
8300
8301         my $self = property_ref($name);
8302         if (defined $self) {
8303             my $options_string = join ", ", @_;
8304             $options_string = ".  Ignoring options $options_string" if $options_string;
8305             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
8306             return $self;
8307         }
8308
8309         my %args = @_;
8310
8311         $self = bless \do { my $anonymous_scalar }, $class;
8312         my $addr = do { no overloading; pack 'J', $self; };
8313
8314         $directory{$addr} = delete $args{'Directory'};
8315         $file{$addr} = delete $args{'File'};
8316         $full_name{$addr} = delete $args{'Full_Name'} || $name;
8317         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8318         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8319         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8320                                     # Starting in this release, property
8321                                     # values should be defined for all
8322                                     # properties, except those overriding this
8323                                     // $v_version ge v5.1.0;
8324
8325         # Rest of parameters passed on.
8326
8327         $has_only_code_point_maps{$addr} = 1;
8328         $table_ref{$addr} = { };
8329         $unique_maps{$addr} = { };
8330
8331         $map{$addr} = Map_Table->new($name,
8332                                     Full_Name => $full_name{$addr},
8333                                     _Alias_Hash => \%alias_to_property_of,
8334                                     _Property => $self,
8335                                     %args);
8336         return $self;
8337     }
8338
8339     # See this program's beginning comment block about overloading the copy
8340     # constructor.  Few operations are defined on properties, but a couple are
8341     # useful.  It is safe to take the inverse of a property, and to remove a
8342     # single code point from it.
8343     use overload
8344         fallback => 0,
8345         qw("") => "_operator_stringify",
8346         "." => \&main::_operator_dot,
8347         ".=" => \&main::_operator_dot_equal,
8348         '==' => \&main::_operator_equal,
8349         '!=' => \&main::_operator_not_equal,
8350         '=' => sub { return shift },
8351         '-=' => "_minus_and_equal",
8352     ;
8353
8354     sub _operator_stringify {
8355         return "Property '" .  shift->full_name . "'";
8356     }
8357
8358     sub _minus_and_equal {
8359         # Remove a single code point from the map table of a property.
8360
8361         my $self = shift;
8362         my $other = shift;
8363         my $reversed = shift;
8364         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8365
8366         if (ref $other) {
8367             Carp::my_carp_bug("Bad news.  Can't cope with a "
8368                         . ref($other)
8369                         . " argument to '-='.  Subtraction ignored.");
8370             return $self;
8371         }
8372         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
8373             Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
8374             . ref $self
8375             . " from a non-object.  undef returned.");
8376             return;
8377         }
8378         else {
8379             no overloading;
8380             $map{pack 'J', $self}->delete_range($other, $other);
8381         }
8382         return $self;
8383     }
8384
8385     sub add_match_table {
8386         # Add a new match table for this property, with name given by the
8387         # parameter.  It returns a pointer to the table.
8388
8389         my $self = shift;
8390         my $name = shift;
8391         my %args = @_;
8392
8393         my $addr = do { no overloading; pack 'J', $self; };
8394
8395         my $table = $table_ref{$addr}{$name};
8396         my $standard_name = main::standardize($name);
8397         if (defined $table
8398             || (defined ($table = $table_ref{$addr}{$standard_name})))
8399         {
8400             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
8401             $table_ref{$addr}{$name} = $table;
8402             return $table;
8403         }
8404         else {
8405
8406             # See if this is a perl extension, if not passed in.
8407             my $perl_extension = delete $args{'Perl_Extension'};
8408             $perl_extension
8409                         = $self->perl_extension if ! defined $perl_extension;
8410
8411             $table = Match_Table->new(
8412                                 Name => $name,
8413                                 Perl_Extension => $perl_extension,
8414                                 _Alias_Hash => $table_ref{$addr},
8415                                 _Property => $self,
8416
8417                                 # gets property's fate and status by default,
8418                                 # except if the name begind with an
8419                                 # underscore, default it to internal
8420                                 Fate => ($name =~ /^_/)
8421                                          ? $INTERNAL_ONLY
8422                                          : $self->fate,
8423                                 Status => $self->status,
8424                                 _Status_Info => $self->status_info,
8425                                 %args);
8426             return unless defined $table;
8427         }
8428
8429         # Save the names for quick look up
8430         $table_ref{$addr}{$standard_name} = $table;
8431         $table_ref{$addr}{$name} = $table;
8432
8433         # Perhaps we can figure out the type of this property based on the
8434         # fact of adding this match table.  First, string properties don't
8435         # have match tables; second, a binary property can't have 3 match
8436         # tables
8437         if ($type{$addr} == $UNKNOWN) {
8438             $type{$addr} = $NON_STRING;
8439         }
8440         elsif ($type{$addr} == $STRING) {
8441             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
8442             $type{$addr} = $NON_STRING;
8443         }
8444         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8445             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8446                 if ($type{$addr} == $BINARY) {
8447                     Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
8448                 }
8449                 $type{$addr} = $ENUM;
8450             }
8451         }
8452
8453         return $table;
8454     }
8455
8456     sub delete_match_table {
8457         # Delete the table referred to by $2 from the property $1.
8458
8459         my $self = shift;
8460         my $table_to_remove = shift;
8461         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8462
8463         my $addr = do { no overloading; pack 'J', $self; };
8464
8465         # Remove all names that refer to it.
8466         foreach my $key (keys %{$table_ref{$addr}}) {
8467             delete $table_ref{$addr}{$key}
8468                                 if $table_ref{$addr}{$key} == $table_to_remove;
8469         }
8470
8471         $table_to_remove->DESTROY;
8472         return;
8473     }
8474
8475     sub table {
8476         # Return a pointer to the match table (with name given by the
8477         # parameter) associated with this property; undef if none.
8478
8479         my $self = shift;
8480         my $name = shift;
8481         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8482
8483         my $addr = do { no overloading; pack 'J', $self; };
8484
8485         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
8486
8487         # If quick look-up failed, try again using the standard form of the
8488         # input name.  If that succeeds, cache the result before returning so
8489         # won't have to standardize this input name again.
8490         my $standard_name = main::standardize($name);
8491         return unless defined $table_ref{$addr}{$standard_name};
8492
8493         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
8494         return $table_ref{$addr}{$name};
8495     }
8496
8497     sub tables {
8498         # Return a list of pointers to all the match tables attached to this
8499         # property
8500
8501         no overloading;
8502         return main::uniques(values %{$table_ref{pack 'J', shift}});
8503     }
8504
8505     sub directory {
8506         # Returns the directory the map table for this property should be
8507         # output in.  If a specific directory has been specified, that has
8508         # priority;  'undef' is returned if the type isn't defined;
8509         # or $map_directory for everything else.
8510
8511         my $addr = do { no overloading; pack 'J', shift; };
8512
8513         return $directory{$addr} if defined $directory{$addr};
8514         return undef if $type{$addr} == $UNKNOWN;
8515         return $map_directory;
8516     }
8517
8518     sub swash_name {
8519         # Return the name that is used to both:
8520         #   1)  Name the file that the map table is written to.
8521         #   2)  The name of swash related stuff inside that file.
8522         # The reason for this is that the Perl core historically has used
8523         # certain names that aren't the same as the Unicode property names.
8524         # To continue using these, $file is hard-coded in this file for those,
8525         # but otherwise the standard name is used.  This is different from the
8526         # external_name, so that the rest of the files, like in lib can use
8527         # the standard name always, without regard to historical precedent.
8528
8529         my $self = shift;
8530         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8531
8532         my $addr = do { no overloading; pack 'J', $self; };
8533
8534         # Swash names are used only on either
8535         # 1) legacy-only properties, because the formats for these are
8536         #    unchangeable, and they have had these lines in them; or
8537         # 2) regular map tables; otherwise there should be no access to the
8538         #    property map table from other parts of Perl.
8539         return if $map{$addr}->fate != $ORDINARY
8540                   && $map{$addr}->fate != $LEGACY_ONLY;
8541
8542         return $file{$addr} if defined $file{$addr};
8543         return $map{$addr}->external_name;
8544     }
8545
8546     sub to_create_match_tables {
8547         # Returns a boolean as to whether or not match tables should be
8548         # created for this property.
8549
8550         my $self = shift;
8551         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8552
8553         # The whole point of this pseudo property is match tables.
8554         return 1 if $self == $perl;
8555
8556         my $addr = do { no overloading; pack 'J', $self; };
8557
8558         # Don't generate tables of code points that match the property values
8559         # of a string property.  Such a list would most likely have many
8560         # property values, each with just one or very few code points mapping
8561         # to it.
8562         return 0 if $type{$addr} == $STRING;
8563
8564         # Don't generate anything for unimplemented properties.
8565         return 0 if grep { $self->complete_name eq $_ }
8566                                                     @unimplemented_properties;
8567         # Otherwise, do.
8568         return 1;
8569     }
8570
8571     sub property_add_or_replace_non_nulls {
8572         # This adds the mappings in the property $other to $self.  Non-null
8573         # mappings from $other override those in $self.  It essentially merges
8574         # the two properties, with the second having priority except for null
8575         # mappings.
8576
8577         my $self = shift;
8578         my $other = shift;
8579         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8580
8581         if (! $other->isa(__PACKAGE__)) {
8582             Carp::my_carp_bug("$other should be a "
8583                             . __PACKAGE__
8584                             . ".  Not a '"
8585                             . ref($other)
8586                             . "'.  Not added;");
8587             return;
8588         }
8589
8590         no overloading;
8591         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8592     }
8593
8594     sub set_proxy_for {
8595         # Certain tables are not generally written out to files, but
8596         # Unicode::UCD has the intelligence to know that the file for $self
8597         # can be used to reconstruct those tables.  This routine just changes
8598         # things so that UCD pod entries for those suppressed tables are
8599         # generated, so the fact that a proxy is used is invisible to the
8600         # user.
8601
8602         my $self = shift;
8603
8604         foreach my $property_name (@_) {
8605             my $ref = property_ref($property_name);
8606             next if $ref->to_output_map;
8607             $ref->set_fate($MAP_PROXIED);
8608         }
8609     }
8610
8611     sub set_type {
8612         # Set the type of the property.  Mostly this is figured out by the
8613         # data in the table.  But this is used to set it explicitly.  The
8614         # reason it is not a standard accessor is that when setting a binary
8615         # property, we need to make sure that all the true/false aliases are
8616         # present, as they were omitted in early Unicode releases.
8617
8618         my $self = shift;
8619         my $type = shift;
8620         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8621
8622         if ($type != $ENUM
8623             && $type != $BINARY
8624             && $type != $FORCED_BINARY
8625             && $type != $STRING)
8626         {
8627             Carp::my_carp("Unrecognized type '$type'.  Type not set");
8628             return;
8629         }
8630
8631         { no overloading; $type{pack 'J', $self} = $type; }
8632         return if $type != $BINARY && $type != $FORCED_BINARY;
8633
8634         my $yes = $self->table('Y');
8635         $yes = $self->table('Yes') if ! defined $yes;
8636         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8637                                                             if ! defined $yes;
8638
8639         # Add aliases in order wanted, duplicates will be ignored.  We use a
8640         # binary property present in all releases for its ordered lists of
8641         # true/false aliases.  Note, that could run into problems in
8642         # outputting things in that we don't distinguish between the name and
8643         # full name of these.  Hopefully, if the table was already created
8644         # before this code is executed, it was done with these set properly.
8645         my $bm = property_ref("Bidi_Mirrored");
8646         foreach my $alias ($bm->table("Y")->aliases) {
8647             $yes->add_alias($alias->name);
8648         }
8649         my $no = $self->table('N');
8650         $no = $self->table('No') if ! defined $no;
8651         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8652         foreach my $alias ($bm->table("N")->aliases) {
8653             $no->add_alias($alias->name);
8654         }
8655
8656         return;
8657     }
8658
8659     sub add_map {
8660         # Add a map to the property's map table.  This also keeps
8661         # track of the maps so that the property type can be determined from
8662         # its data.
8663
8664         my $self = shift;
8665         my $start = shift;  # First code point in range
8666         my $end = shift;    # Final code point in range
8667         my $map = shift;    # What the range maps to.
8668         # Rest of parameters passed on.
8669
8670         my $addr = do { no overloading; pack 'J', $self; };
8671
8672         # If haven't the type of the property, gather information to figure it
8673         # out.
8674         if ($type{$addr} == $UNKNOWN) {
8675
8676             # If the map contains an interior blank or dash, or most other
8677             # nonword characters, it will be a string property.  This
8678             # heuristic may actually miss some string properties.  If so, they
8679             # may need to have explicit set_types called for them.  This
8680             # happens in the Unihan properties.
8681             if ($map =~ / (?<= . ) [ -] (?= . ) /x
8682                 || $map =~ / [^\w.\/\ -]  /x)
8683             {
8684                 $self->set_type($STRING);
8685
8686                 # $unique_maps is used for disambiguating between ENUM and
8687                 # BINARY later; since we know the property is not going to be
8688                 # one of those, no point in keeping the data around
8689                 undef $unique_maps{$addr};
8690             }
8691             else {
8692
8693                 # Not necessarily a string.  The final decision has to be
8694                 # deferred until all the data are in.  We keep track of if all
8695                 # the values are code points for that eventual decision.
8696                 $has_only_code_point_maps{$addr} &=
8697                                             $map =~ / ^ $code_point_re $/x;
8698
8699                 # For the purposes of disambiguating between binary and other
8700                 # enumerations at the end, we keep track of the first three
8701                 # distinct property values.  Once we get to three, we know
8702                 # it's not going to be binary, so no need to track more.
8703                 if (scalar keys %{$unique_maps{$addr}} < 3) {
8704                     $unique_maps{$addr}{main::standardize($map)} = 1;
8705                 }
8706             }
8707         }
8708
8709         # Add the mapping by calling our map table's method
8710         return $map{$addr}->add_map($start, $end, $map, @_);
8711     }
8712
8713     sub compute_type {
8714         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
8715         # should be called after the property is mostly filled with its maps.
8716         # We have been keeping track of what the property values have been,
8717         # and now have the necessary information to figure out the type.
8718
8719         my $self = shift;
8720         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8721
8722         my $addr = do { no overloading; pack 'J', $self; };
8723
8724         my $type = $type{$addr};
8725
8726         # If already have figured these out, no need to do so again, but we do
8727         # a double check on ENUMS to make sure that a string property hasn't
8728         # improperly been classified as an ENUM, so continue on with those.
8729         return if $type == $STRING
8730                   || $type == $BINARY
8731                   || $type == $FORCED_BINARY;
8732
8733         # If every map is to a code point, is a string property.
8734         if ($type == $UNKNOWN
8735             && ($has_only_code_point_maps{$addr}
8736                 || (defined $map{$addr}->default_map
8737                     && $map{$addr}->default_map eq "")))
8738         {
8739             $self->set_type($STRING);
8740         }
8741         else {
8742
8743             # Otherwise, it is to some sort of enumeration.  (The case where
8744             # it is a Unicode miscellaneous property, and treated like a
8745             # string in this program is handled in add_map()).  Distinguish
8746             # between binary and some other enumeration type.  Of course, if
8747             # there are more than two values, it's not binary.  But more
8748             # subtle is the test that the default mapping is defined means it
8749             # isn't binary.  This in fact may change in the future if Unicode
8750             # changes the way its data is structured.  But so far, no binary
8751             # properties ever have @missing lines for them, so the default map
8752             # isn't defined for them.  The few properties that are two-valued
8753             # and aren't considered binary have the default map defined
8754             # starting in Unicode 5.0, when the @missing lines appeared; and
8755             # this program has special code to put in a default map for them
8756             # for earlier than 5.0 releases.
8757             if ($type == $ENUM
8758                 || scalar keys %{$unique_maps{$addr}} > 2
8759                 || defined $self->default_map)
8760             {
8761                 my $tables = $self->tables;
8762                 my $count = $self->count;
8763                 if ($verbosity && $tables > 500 && $tables/$count > .1) {
8764                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
8765                 }
8766                 $self->set_type($ENUM);
8767             }
8768             else {
8769                 $self->set_type($BINARY);
8770             }
8771         }
8772         undef $unique_maps{$addr};  # Garbage collect
8773         return;
8774     }
8775
8776     sub set_fate {
8777         my $self = shift;
8778         my $fate = shift;
8779         my $reason = shift;  # Ignored unless suppressing
8780         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8781
8782         my $addr = do { no overloading; pack 'J', $self; };
8783         if ($fate == $SUPPRESSED) {
8784             $why_suppressed{$self->complete_name} = $reason;
8785         }
8786
8787         # Each table shares the property's fate, except that MAP_PROXIED
8788         # doesn't affect match tables
8789         $map{$addr}->set_fate($fate, $reason);
8790         if ($fate != $MAP_PROXIED) {
8791             foreach my $table ($map{$addr}, $self->tables) {
8792                 $table->set_fate($fate, $reason);
8793             }
8794         }
8795         return;
8796     }
8797
8798
8799     # Most of the accessors for a property actually apply to its map table.
8800     # Setup up accessor functions for those, referring to %map
8801     for my $sub (qw(
8802                     add_alias
8803                     add_anomalous_entry
8804                     add_comment
8805                     add_conflicting
8806                     add_description
8807                     add_duplicate
8808                     add_note
8809                     aliases
8810                     comment
8811                     complete_name
8812                     containing_range
8813                     count
8814                     default_map
8815                     delete_range
8816                     description
8817                     each_range
8818                     external_name
8819                     fate
8820                     file_path
8821                     format
8822                     initialize
8823                     inverse_list
8824                     is_empty
8825                     replacement_property
8826                     name
8827                     note
8828                     perl_extension
8829                     property
8830                     range_count
8831                     ranges
8832                     range_size_1
8833                     reset_each_range
8834                     set_comment
8835                     set_default_map
8836                     set_file_path
8837                     set_final_comment
8838                     _set_format
8839                     set_range_size_1
8840                     set_status
8841                     set_to_output_map
8842                     short_name
8843                     status
8844                     status_info
8845                     to_output_map
8846                     type_of
8847                     value_of
8848                     write
8849                 ))
8850                     # 'property' above is for symmetry, so that one can take
8851                     # the property of a property and get itself, and so don't
8852                     # have to distinguish between properties and tables in
8853                     # calling code
8854     {
8855         no strict "refs";
8856         *$sub = sub {
8857             use strict "refs";
8858             my $self = shift;
8859             no overloading;
8860             return $map{pack 'J', $self}->$sub(@_);
8861         }
8862     }
8863
8864
8865 } # End closure
8866
8867 package main;
8868
8869     sub display_chr {
8870         # Converts an ordinal character value to a displayable string, using a
8871         # NBSP to hold combining characters.
8872
8873         my $ord = shift;
8874         my $chr = chr $ord;
8875         return $chr if $ccc->table(0)->contains($ord);
8876         return chr(utf8::unicode_to_native(0xA0)) . $chr;
8877     }
8878
8879 sub join_lines($) {
8880     # Returns lines of the input joined together, so that they can be folded
8881     # properly.
8882     # This causes continuation lines to be joined together into one long line
8883     # for folding.  A continuation line is any line that doesn't begin with a
8884     # space or "\b" (the latter is stripped from the output).  This is so
8885     # lines can be be in a HERE document so as to fit nicely in the terminal
8886     # width, but be joined together in one long line, and then folded with
8887     # indents, '#' prefixes, etc, properly handled.
8888     # A blank separates the joined lines except if there is a break; an extra
8889     # blank is inserted after a period ending a line.
8890
8891     # Initialize the return with the first line.
8892     my ($return, @lines) = split "\n", shift;
8893
8894     # If the first line is null, it was an empty line, add the \n back in
8895     $return = "\n" if $return eq "";
8896
8897     # Now join the remainder of the physical lines.
8898     for my $line (@lines) {
8899
8900         # An empty line means wanted a blank line, so add two \n's to get that
8901         # effect, and go to the next line.
8902         if (length $line == 0) {
8903             $return .= "\n\n";
8904             next;
8905         }
8906
8907         # Look at the last character of what we have so far.
8908         my $previous_char = substr($return, -1, 1);
8909
8910         # And at the next char to be output.
8911         my $next_char = substr($line, 0, 1);
8912
8913         if ($previous_char ne "\n") {
8914
8915             # Here didn't end wth a nl.  If the next char a blank or \b, it
8916             # means that here there is a break anyway.  So add a nl to the
8917             # output.
8918             if ($next_char eq " " || $next_char eq "\b") {
8919                 $previous_char = "\n";
8920                 $return .= $previous_char;
8921             }
8922
8923             # Add an extra space after periods.
8924             $return .= " " if $previous_char eq '.';
8925         }
8926
8927         # Here $previous_char is still the latest character to be output.  If
8928         # it isn't a nl, it means that the next line is to be a continuation
8929         # line, with a blank inserted between them.
8930         $return .= " " if $previous_char ne "\n";
8931
8932         # Get rid of any \b
8933         substr($line, 0, 1) = "" if $next_char eq "\b";
8934
8935         # And append this next line.
8936         $return .= $line;
8937     }
8938
8939     return $return;
8940 }
8941
8942 sub simple_fold($;$$$) {
8943     # Returns a string of the input (string or an array of strings) folded
8944     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8945     # a \n
8946     # This is tailored for the kind of text written by this program,
8947     # especially the pod file, which can have very long names with
8948     # underscores in the middle, or words like AbcDefgHij....  We allow
8949     # breaking in the middle of such constructs if the line won't fit
8950     # otherwise.  The break in such cases will come either just after an
8951     # underscore, or just before one of the Capital letters.
8952
8953     local $to_trace = 0 if main::DEBUG;
8954
8955     my $line = shift;
8956     my $prefix = shift;     # Optional string to prepend to each output
8957                             # line
8958     $prefix = "" unless defined $prefix;
8959
8960     my $hanging_indent = shift; # Optional number of spaces to indent
8961                                 # continuation lines
8962     $hanging_indent = 0 unless $hanging_indent;
8963
8964     my $right_margin = shift;   # Optional number of spaces to narrow the
8965                                 # total width by.
8966     $right_margin = 0 unless defined $right_margin;
8967
8968     # Call carp with the 'nofold' option to avoid it from trying to call us
8969     # recursively
8970     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8971
8972     # The space available doesn't include what's automatically prepended
8973     # to each line, or what's reserved on the right.
8974     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8975     # XXX Instead of using the 'nofold' perhaps better to look up the stack
8976
8977     if (DEBUG && $hanging_indent >= $max) {
8978         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
8979         $hanging_indent = 0;
8980     }
8981
8982     # First, split into the current physical lines.
8983     my @line;
8984     if (ref $line) {        # Better be an array, because not bothering to
8985                             # test
8986         foreach my $line (@{$line}) {
8987             push @line, split /\n/, $line;
8988         }
8989     }
8990     else {
8991         @line = split /\n/, $line;
8992     }
8993
8994     #local $to_trace = 1 if main::DEBUG;
8995     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8996
8997     # Look at each current physical line.
8998     for (my $i = 0; $i < @line; $i++) {
8999         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9000         #local $to_trace = 1 if main::DEBUG;
9001         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9002
9003         # Remove prefix, because will be added back anyway, don't want
9004         # doubled prefix
9005         $line[$i] =~ s/^$prefix//;
9006
9007         # Remove trailing space
9008         $line[$i] =~ s/\s+\Z//;
9009
9010         # If the line is too long, fold it.
9011         if (length $line[$i] > $max) {
9012             my $remainder;
9013
9014             # Here needs to fold.  Save the leading space in the line for
9015             # later.
9016             $line[$i] =~ /^ ( \s* )/x;
9017             my $leading_space = $1;
9018             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9019
9020             # If character at final permissible position is white space,
9021             # fold there, which will delete that white space
9022             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9023                 $remainder = substr($line[$i], $max);
9024                 $line[$i] = substr($line[$i], 0, $max - 1);
9025             }
9026             else {
9027
9028                 # Otherwise fold at an acceptable break char closest to
9029                 # the max length.  Look at just the maximal initial
9030                 # segment of the line
9031                 my $segment = substr($line[$i], 0, $max - 1);
9032                 if ($segment =~
9033                     /^ ( .{$hanging_indent}   # Don't look before the
9034                                               #  indent.
9035                         \ *                   # Don't look in leading
9036                                               #  blanks past the indent
9037                             [^ ] .*           # Find the right-most
9038                         (?:                   #  acceptable break:
9039                             [ \s = ]          # space or equal
9040                             | - (?! [.0-9] )  # or non-unary minus.
9041                         )                     # $1 includes the character
9042                     )/x)
9043                 {
9044                     # Split into the initial part that fits, and remaining
9045                     # part of the input
9046                     $remainder = substr($line[$i], length $1);
9047                     $line[$i] = $1;
9048                     trace $line[$i] if DEBUG && $to_trace;
9049                     trace $remainder if DEBUG && $to_trace;
9050                 }
9051
9052                 # If didn't find a good breaking spot, see if there is a
9053                 # not-so-good breaking spot.  These are just after
9054                 # underscores or where the case changes from lower to
9055                 # upper.  Use \a as a soft hyphen, but give up
9056                 # and don't break the line if there is actually a \a
9057                 # already in the input.  We use an ascii character for the
9058                 # soft-hyphen to avoid any attempt by miniperl to try to
9059                 # access the files that this program is creating.
9060                 elsif ($segment !~ /\a/
9061                        && ($segment =~ s/_/_\a/g
9062                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9063                 {
9064                     # Here were able to find at least one place to insert
9065                     # our substitute soft hyphen.  Find the right-most one
9066                     # and replace it by a real hyphen.
9067                     trace $segment if DEBUG && $to_trace;
9068                     substr($segment,
9069                             rindex($segment, "\a"),
9070                             1) = '-';
9071
9072                     # Then remove the soft hyphen substitutes.
9073                     $segment =~ s/\a//g;
9074                     trace $segment if DEBUG && $to_trace;
9075
9076                     # And split into the initial part that fits, and
9077                     # remainder of the line
9078                     my $pos = rindex($segment, '-');
9079                     $remainder = substr($line[$i], $pos);
9080                     trace $remainder if DEBUG && $to_trace;
9081                     $line[$i] = substr($segment, 0, $pos + 1);
9082                 }
9083             }
9084
9085             # Here we know if we can fold or not.  If we can, $remainder
9086             # is what remains to be processed in the next iteration.
9087             if (defined $remainder) {
9088                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9089
9090                 # Insert the folded remainder of the line as a new element
9091                 # of the array.  (It may still be too long, but we will
9092                 # deal with that next time through the loop.)  Omit any
9093                 # leading space in the remainder.
9094                 $remainder =~ s/^\s+//;
9095                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9096
9097                 # But then indent by whichever is larger of:
9098                 # 1) the leading space on the input line;
9099                 # 2) the hanging indent.
9100                 # This preserves indentation in the original line.
9101                 my $lead = ($leading_space)
9102                             ? length $leading_space
9103                             : $hanging_indent;
9104                 $lead = max($lead, $hanging_indent);
9105                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9106             }
9107         }
9108
9109         # Ready to output the line. Get rid of any trailing space
9110         # And prefix by the required $prefix passed in.
9111         $line[$i] =~ s/\s+$//;
9112         $line[$i] = "$prefix$line[$i]\n";
9113     } # End of looping through all the lines.
9114
9115     return join "", @line;
9116 }
9117
9118 sub property_ref {  # Returns a reference to a property object.
9119     return Property::property_ref(@_);
9120 }
9121
9122 sub force_unlink ($) {
9123     my $filename = shift;
9124     return unless file_exists($filename);
9125     return if CORE::unlink($filename);
9126
9127     # We might need write permission
9128     chmod 0777, $filename;
9129     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9130     return;
9131 }
9132
9133 sub write ($$@) {
9134     # Given a filename and references to arrays of lines, write the lines of
9135     # each array to the file
9136     # Filename can be given as an arrayref of directory names
9137
9138     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9139
9140     my $file  = shift;
9141     my $use_utf8 = shift;
9142
9143     # Get into a single string if an array, and get rid of, in Unix terms, any
9144     # leading '.'
9145     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9146     $file = File::Spec->canonpath($file);
9147
9148     # If has directories, make sure that they all exist
9149     (undef, my $directories, undef) = File::Spec->splitpath($file);
9150     File::Path::mkpath($directories) if $directories && ! -d $directories;
9151
9152     push @files_actually_output, $file;
9153
9154     force_unlink ($file);
9155
9156     my $OUT;
9157     if (not open $OUT, ">", $file) {
9158         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9159         return;
9160     }
9161
9162     binmode $OUT, ":utf8" if $use_utf8;
9163
9164     while (defined (my $lines_ref = shift)) {
9165         unless (@$lines_ref) {
9166             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9167         }
9168
9169         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9170     }
9171     close $OUT or die Carp::my_carp("close '$file' failed: $!");
9172
9173     print "$file written.\n" if $verbosity >= $VERBOSE;
9174
9175     return;
9176 }
9177
9178
9179 sub Standardize($) {
9180     # This converts the input name string into a standardized equivalent to
9181     # use internally.
9182
9183     my $name = shift;
9184     unless (defined $name) {
9185       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9186       return;
9187     }
9188
9189     # Remove any leading or trailing white space
9190     $name =~ s/^\s+//g;
9191     $name =~ s/\s+$//g;
9192
9193     # Convert interior white space and hyphens into underscores.
9194     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9195
9196     # Capitalize the letter following an underscore, and convert a sequence of
9197     # multiple underscores to a single one
9198     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9199
9200     # And capitalize the first letter, but not for the special cjk ones.
9201     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9202     return $name;
9203 }
9204
9205 sub standardize ($) {
9206     # Returns a lower-cased standardized name, without underscores.  This form
9207     # is chosen so that it can distinguish between any real versus superficial
9208     # Unicode name differences.  It relies on the fact that Unicode doesn't
9209     # have interior underscores, white space, nor dashes in any
9210     # stricter-matched name.  It should not be used on Unicode code point
9211     # names (the Name property), as they mostly, but not always follow these
9212     # rules.
9213
9214     my $name = Standardize(shift);
9215     return if !defined $name;
9216
9217     $name =~ s/ (?<= .) _ (?= . ) //xg;
9218     return lc $name;
9219 }
9220
9221 sub utf8_heavy_name ($$) {
9222     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
9223     # perhaps this function should be placed somewhere, like Heavy.pl so that
9224     # utf8_heavy can use it directly without duplicating code that can get
9225     # out-of sync.
9226
9227     my $table = shift;
9228     my $alias = shift;
9229     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9230
9231     my $property = $table->property;
9232     $property = ($property == $perl)
9233                 ? ""                # 'perl' is never explicitly stated
9234                 : standardize($property->name) . '=';
9235     if ($alias->loose_match) {
9236         return $property . standardize($alias->name);
9237     }
9238     else {
9239         return lc ($property . $alias->name);
9240     }
9241
9242     return;
9243 }
9244
9245 {   # Closure
9246
9247     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
9248     my %already_output;
9249
9250     $main::simple_dumper_nesting = 0;
9251
9252     sub simple_dumper {
9253         # Like Simple Data::Dumper. Good enough for our needs. We can't use
9254         # the real thing as we have to run under miniperl.
9255
9256         # It is designed so that on input it is at the beginning of a line,
9257         # and the final thing output in any call is a trailing ",\n".
9258
9259         my $item = shift;
9260         my $indent = shift;
9261         $indent = "" if ! $debugging_build || ! defined $indent;
9262
9263         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9264
9265         # nesting level is localized, so that as the call stack pops, it goes
9266         # back to the prior value.
9267         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9268         undef %already_output if $main::simple_dumper_nesting == 0;
9269         $main::simple_dumper_nesting++;
9270         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9271
9272         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9273
9274         # Determine the indent for recursive calls.
9275         my $next_indent = $indent . $indent_increment;
9276
9277         my $output;
9278         if (! ref $item) {
9279
9280             # Dump of scalar: just output it in quotes if not a number.  To do
9281             # so we must escape certain characters, and therefore need to
9282             # operate on a copy to avoid changing the original
9283             my $copy = $item;
9284             $copy = $UNDEF unless defined $copy;
9285
9286             # Quote non-integers (integers also have optional leading '-')
9287             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9288
9289                 # Escape apostrophe and backslash
9290                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
9291                 $copy = "'$copy'";
9292             }
9293             $output = "$indent$copy,\n";
9294         }
9295         else {
9296
9297             # Keep track of cycles in the input, and refuse to infinitely loop
9298             my $addr = do { no overloading; pack 'J', $item; };
9299             if (defined $already_output{$addr}) {
9300                 return "${indent}ALREADY OUTPUT: $item\n";
9301             }
9302             $already_output{$addr} = $item;
9303
9304             if (ref $item eq 'ARRAY') {
9305                 my $using_brackets;
9306                 $output = $indent;
9307                 if ($main::simple_dumper_nesting > 1) {
9308                     $output .= '[';
9309                     $using_brackets = 1;
9310                 }
9311                 else {
9312                     $using_brackets = 0;
9313                 }
9314
9315                 # If the array is empty, put the closing bracket on the same
9316                 # line.  Otherwise, recursively add each array element
9317                 if (@$item == 0) {
9318                     $output .= " ";
9319                 }
9320                 else {
9321                     $output .= "\n";
9322                     for (my $i = 0; $i < @$item; $i++) {
9323
9324                         # Indent array elements one level
9325                         $output .= &simple_dumper($item->[$i], $next_indent);
9326                         next if ! $debugging_build;
9327                         $output =~ s/\n$//;      # Remove any trailing nl so
9328                         $output .= " # [$i]\n";  # as to add a comment giving
9329                                                  # the array index
9330                     }
9331                     $output .= $indent;     # Indent closing ']' to orig level
9332                 }
9333                 $output .= ']' if $using_brackets;
9334                 $output .= ",\n";
9335             }
9336             elsif (ref $item eq 'HASH') {
9337                 my $is_first_line;
9338                 my $using_braces;
9339                 my $body_indent;
9340
9341                 # No surrounding braces at top level
9342                 $output .= $indent;
9343                 if ($main::simple_dumper_nesting > 1) {
9344                     $output .= "{\n";
9345                     $is_first_line = 0;
9346                     $body_indent = $next_indent;
9347                     $next_indent .= $indent_increment;
9348                     $using_braces = 1;
9349                 }
9350                 else {
9351                     $is_first_line = 1;
9352                     $body_indent = $indent;
9353                     $using_braces = 0;
9354                 }
9355
9356                 # Output hashes sorted alphabetically instead of apparently
9357                 # random.  Use caseless alphabetic sort
9358                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9359                 {
9360                     if ($is_first_line) {
9361                         $is_first_line = 0;
9362                     }
9363                     else {
9364                         $output .= "$body_indent";
9365                     }
9366
9367                     # The key must be a scalar, but this recursive call quotes
9368                     # it
9369                     $output .= &simple_dumper($key);
9370
9371                     # And change the trailing comma and nl to the hash fat
9372                     # comma for clarity, and so the value can be on the same
9373                     # line
9374                     $output =~ s/,\n$/ => /;
9375
9376                     # Recursively call to get the value's dump.
9377                     my $next = &simple_dumper($item->{$key}, $next_indent);
9378
9379                     # If the value is all on one line, remove its indent, so
9380                     # will follow the => immediately.  If it takes more than
9381                     # one line, start it on a new line.
9382                     if ($next !~ /\n.*\n/) {
9383                         $next =~ s/^ *//;
9384                     }
9385                     else {
9386                         $output .= "\n";
9387                     }
9388                     $output .= $next;
9389                 }
9390
9391                 $output .= "$indent},\n" if $using_braces;
9392             }
9393             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9394                 $output = $indent . ref($item) . "\n";
9395                 # XXX see if blessed
9396             }
9397             elsif ($item->can('dump')) {
9398
9399                 # By convention in this program, objects furnish a 'dump'
9400                 # method.  Since not doing any output at this level, just pass
9401                 # on the input indent
9402                 $output = $item->dump($indent);
9403             }
9404             else {
9405                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
9406             }
9407         }
9408         return $output;
9409     }
9410 }
9411
9412 sub dump_inside_out {
9413     # Dump inside-out hashes in an object's state by converting them to a
9414     # regular hash and then calling simple_dumper on that.
9415
9416     my $object = shift;
9417     my $fields_ref = shift;
9418     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9419
9420     my $addr = do { no overloading; pack 'J', $object; };
9421
9422     my %hash;
9423     foreach my $key (keys %$fields_ref) {
9424         $hash{$key} = $fields_ref->{$key}{$addr};
9425     }
9426
9427     return simple_dumper(\%hash, @_);
9428 }
9429
9430 sub _operator_dot {
9431     # Overloaded '.' method that is common to all packages.  It uses the
9432     # package's stringify method.
9433
9434     my $self = shift;
9435     my $other = shift;
9436     my $reversed = shift;
9437     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9438
9439     $other = "" unless defined $other;
9440
9441     foreach my $which (\$self, \$other) {
9442         next unless ref $$which;
9443         if ($$which->can('_operator_stringify')) {
9444             $$which = $$which->_operator_stringify;
9445         }
9446         else {
9447             my $ref = ref $$which;
9448             my $addr = do { no overloading; pack 'J', $$which; };
9449             $$which = "$ref ($addr)";
9450         }
9451     }
9452     return ($reversed)
9453             ? "$other$self"
9454             : "$self$other";
9455 }
9456
9457 sub _operator_dot_equal {
9458     # Overloaded '.=' method that is common to all packages.
9459
9460     my $self = shift;
9461     my $other = shift;
9462     my $reversed = shift;
9463     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9464
9465     $other = "" unless defined $other;
9466
9467     if ($reversed) {
9468         return $other .= "$self";
9469     }
9470     else {
9471         return "$self" . "$other";
9472     }
9473 }
9474
9475 sub _operator_equal {
9476     # Generic overloaded '==' routine.  To be equal, they must be the exact
9477     # same object
9478
9479     my $self = shift;
9480     my $other = shift;
9481
9482     return 0 unless defined $other;
9483     return 0 unless ref $other;
9484     no overloading;
9485     return $self == $other;
9486 }
9487
9488 sub _operator_not_equal {
9489     my $self = shift;
9490     my $other = shift;
9491
9492     return ! _operator_equal($self, $other);
9493 }
9494
9495 sub process_PropertyAliases($) {
9496     # This reads in the PropertyAliases.txt file, which contains almost all
9497     # the character properties in Unicode and their equivalent aliases:
9498     # scf       ; Simple_Case_Folding         ; sfc
9499     #
9500     # Field 0 is the preferred short name for the property.
9501     # Field 1 is the full name.
9502     # Any succeeding ones are other accepted names.
9503
9504     my $file= shift;
9505     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9506
9507     # This whole file was non-existent in early releases, so use our own
9508     # internal one.
9509     $file->insert_lines(get_old_property_aliases())
9510                                                 if ! -e 'PropertyAliases.txt';
9511
9512     # Add any cjk properties that may have been defined.
9513     $file->insert_lines(@cjk_properties);
9514
9515     while ($file->next_line) {
9516
9517         my @data = split /\s*;\s*/;
9518
9519         my $full = $data[1];
9520
9521         my $this = Property->new($data[0], Full_Name => $full);
9522
9523         # Start looking for more aliases after these two.
9524         for my $i (2 .. @data - 1) {
9525             $this->add_alias($data[$i]);
9526         }
9527
9528     }
9529
9530     my $scf = property_ref("Simple_Case_Folding");
9531     $scf->add_alias("scf");
9532     $scf->add_alias("sfc");
9533
9534     return;
9535 }
9536
9537 sub finish_property_setup {
9538     # Finishes setting up after PropertyAliases.
9539
9540     my $file = shift;
9541     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9542
9543     # This entry was missing from this file in earlier Unicode versions
9544     if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9545         Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9546     }
9547
9548     # These two properties must be defined in all releases so we can generate
9549     # the tables from them to make regex \X work, but suppress their output so
9550     # aren't application visible prior to releases where they should be
9551     if (! defined property_ref('GCB')) {
9552         Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9553                       Fate => $PLACEHOLDER);
9554     }
9555     if (! defined property_ref('hst')) {
9556         Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9557                       Fate => $PLACEHOLDER);
9558     }
9559
9560     # These are used so much, that we set globals for them.
9561     $gc = property_ref('General_Category');
9562     $block = property_ref('Block');
9563     $script = property_ref('Script');
9564
9565     # Perl adds this alias.
9566     $gc->add_alias('Category');
9567
9568     # Unicode::Normalize expects this file with this name and directory.
9569     $ccc = property_ref('Canonical_Combining_Class');
9570     if (defined $ccc) {
9571         $ccc->set_file('CombiningClass');
9572         $ccc->set_directory(File::Spec->curdir());
9573     }
9574
9575     # These two properties aren't actually used in the core, but unfortunately
9576     # the names just above that are in the core interfere with these, so
9577     # choose different names.  These aren't a problem unless the map tables
9578     # for these files get written out.
9579     my $lowercase = property_ref('Lowercase');
9580     $lowercase->set_file('IsLower') if defined $lowercase;
9581     my $uppercase = property_ref('Uppercase');
9582     $uppercase->set_file('IsUpper') if defined $uppercase;
9583
9584     # Set up the hard-coded default mappings, but only on properties defined
9585     # for this release
9586     foreach my $property (keys %default_mapping) {
9587         my $property_object = property_ref($property);
9588         next if ! defined $property_object;
9589         my $default_map = $default_mapping{$property};
9590         $property_object->set_default_map($default_map);
9591
9592         # A map of <code point> implies the property is string.
9593         if ($property_object->type == $UNKNOWN
9594             && $default_map eq $CODE_POINT)
9595         {
9596             $property_object->set_type($STRING);
9597         }
9598     }
9599
9600     # The following use the Multi_Default class to create objects for
9601     # defaults.
9602
9603     # Bidi class has a complicated default, but the derived file takes care of
9604     # the complications, leaving just 'L'.
9605     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9606         property_ref('Bidi_Class')->set_default_map('L');
9607     }
9608     else {
9609         my $default;
9610
9611         # The derived file was introduced in 3.1.1.  The values below are
9612         # taken from table 3-8, TUS 3.0
9613         my $default_R =
9614             'my $default = Range_List->new;
9615              $default->add_range(0x0590, 0x05FF);
9616              $default->add_range(0xFB1D, 0xFB4F);'
9617         ;
9618
9619         # The defaults apply only to unassigned characters
9620         $default_R .= '$gc->table("Unassigned") & $default;';
9621
9622         if ($v_version lt v3.0.0) {
9623             $default = Multi_Default->new(R => $default_R, 'L');
9624         }
9625         else {
9626
9627             # AL apparently not introduced until 3.0:  TUS 2.x references are
9628             # not on-line to check it out
9629             my $default_AL =
9630                 'my $default = Range_List->new;
9631                  $default->add_range(0x0600, 0x07BF);
9632                  $default->add_range(0xFB50, 0xFDFF);
9633                  $default->add_range(0xFE70, 0xFEFF);'
9634             ;
9635
9636             # Non-character code points introduced in this release; aren't AL
9637             if ($v_version ge 3.1.0) {
9638                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9639             }
9640             $default_AL .= '$gc->table("Unassigned") & $default';
9641             $default = Multi_Default->new(AL => $default_AL,
9642                                           R => $default_R,
9643                                           'L');
9644         }
9645         property_ref('Bidi_Class')->set_default_map($default);
9646     }
9647
9648     # Joining type has a complicated default, but the derived file takes care
9649     # of the complications, leaving just 'U' (or Non_Joining), except the file
9650     # is bad in 3.1.0
9651     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9652         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9653             property_ref('Joining_Type')->set_default_map('Non_Joining');
9654         }
9655         else {
9656
9657             # Otherwise, there are not one, but two possibilities for the
9658             # missing defaults: T and U.
9659             # The missing defaults that evaluate to T are given by:
9660             # T = Mn + Cf - ZWNJ - ZWJ
9661             # where Mn and Cf are the general category values. In other words,
9662             # any non-spacing mark or any format control character, except
9663             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9664             # WIDTH JOINER (joining type C).
9665             my $default = Multi_Default->new(
9666                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9667                'Non_Joining');
9668             property_ref('Joining_Type')->set_default_map($default);
9669         }
9670     }
9671
9672     # Line break has a complicated default in early releases. It is 'Unknown'
9673     # for non-assigned code points; 'AL' for assigned.
9674     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9675         my $lb = property_ref('Line_Break');
9676         if ($v_version gt 3.2.0) {
9677             $lb->set_default_map('Unknown');
9678         }
9679         else {
9680             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9681                                               'AL');
9682             $lb->set_default_map($default);
9683         }
9684
9685         # If has the URS property, make sure that the standard aliases are in
9686         # it, since not in the input tables in some versions.
9687         my $urs = property_ref('Unicode_Radical_Stroke');
9688         if (defined $urs) {
9689             $urs->add_alias('cjkRSUnicode');
9690             $urs->add_alias('kRSUnicode');
9691         }
9692     }
9693
9694     # For backwards compatibility with applications that may read the mapping
9695     # file directly (it was documented in 5.12 and 5.14 as being thusly
9696     # usable), keep it from being adjusted.  (range_size_1 is
9697     # used to force the traditional format.)
9698     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9699         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9700         $nfkc_cf->set_range_size_1(1);
9701     }
9702     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9703         $bmg->set_to_output_map($EXTERNAL_MAP);
9704         $bmg->set_range_size_1(1);
9705     }
9706
9707     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9708
9709     return;
9710 }
9711
9712 sub get_old_property_aliases() {
9713     # Returns what would be in PropertyAliases.txt if it existed in very old
9714     # versions of Unicode.  It was derived from the one in 3.2, and pared
9715     # down based on the data that was actually in the older releases.
9716     # An attempt was made to use the existence of files to mean inclusion or
9717     # not of various aliases, but if this was not sufficient, using version
9718     # numbers was resorted to.
9719
9720     my @return;
9721
9722     # These are to be used in all versions (though some are constructed by
9723     # this program if missing)
9724     push @return, split /\n/, <<'END';
9725 bc        ; Bidi_Class
9726 Bidi_M    ; Bidi_Mirrored
9727 cf        ; Case_Folding
9728 ccc       ; Canonical_Combining_Class
9729 dm        ; Decomposition_Mapping
9730 dt        ; Decomposition_Type
9731 gc        ; General_Category
9732 isc       ; ISO_Comment
9733 lc        ; Lowercase_Mapping
9734 na        ; Name
9735 na1       ; Unicode_1_Name
9736 nt        ; Numeric_Type
9737 nv        ; Numeric_Value
9738 scf       ; Simple_Case_Folding
9739 slc       ; Simple_Lowercase_Mapping
9740 stc       ; Simple_Titlecase_Mapping
9741 suc       ; Simple_Uppercase_Mapping
9742 tc        ; Titlecase_Mapping
9743 uc        ; Uppercase_Mapping
9744 END
9745
9746     if (-e 'Blocks.txt') {
9747         push @return, "blk       ; Block\n";
9748     }
9749     if (-e 'ArabicShaping.txt') {
9750         push @return, split /\n/, <<'END';
9751 jg        ; Joining_Group
9752 jt        ; Joining_Type
9753 END
9754     }
9755     if (-e 'PropList.txt') {
9756
9757         # This first set is in the original old-style proplist.
9758         push @return, split /\n/, <<'END';
9759 Bidi_C    ; Bidi_Control
9760 Dash      ; Dash
9761 Dia       ; Diacritic
9762 Ext       ; Extender
9763 Hex       ; Hex_Digit
9764 Hyphen    ; Hyphen
9765 IDC       ; ID_Continue
9766 Ideo      ; Ideographic
9767 Join_C    ; Join_Control
9768 Math      ; Math
9769 QMark     ; Quotation_Mark
9770 Term      ; Terminal_Punctuation
9771 WSpace    ; White_Space
9772 END
9773         # The next sets were added later
9774         if ($v_version ge v3.0.0) {
9775             push @return, split /\n/, <<'END';
9776 Upper     ; Uppercase
9777 Lower     ; Lowercase
9778 END
9779         }
9780         if ($v_version ge v3.0.1) {
9781             push @return, split /\n/, <<'END';
9782 NChar     ; Noncharacter_Code_Point
9783 END
9784         }
9785         # The next sets were added in the new-style
9786         if ($v_version ge v3.1.0) {
9787             push @return, split /\n/, <<'END';
9788 OAlpha    ; Other_Alphabetic
9789 OLower    ; Other_Lowercase
9790 OMath     ; Other_Math
9791 OUpper    ; Other_Uppercase
9792 END
9793         }
9794         if ($v_version ge v3.1.1) {
9795             push @return, "AHex      ; ASCII_Hex_Digit\n";
9796         }
9797     }
9798     if (-e 'EastAsianWidth.txt') {
9799         push @return, "ea        ; East_Asian_Width\n";
9800     }
9801     if (-e 'CompositionExclusions.txt') {
9802         push @return, "CE        ; Composition_Exclusion\n";
9803     }
9804     if (-e 'LineBreak.txt') {
9805         push @return, "lb        ; Line_Break\n";
9806     }
9807     if (-e 'BidiMirroring.txt') {
9808         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
9809     }
9810     if (-e 'Scripts.txt') {
9811         push @return, "sc        ; Script\n";
9812     }
9813     if (-e 'DNormalizationProps.txt') {
9814         push @return, split /\n/, <<'END';
9815 Comp_Ex   ; Full_Composition_Exclusion
9816 FC_NFKC   ; FC_NFKC_Closure
9817 NFC_QC    ; NFC_Quick_Check
9818 NFD_QC    ; NFD_Quick_Check
9819 NFKC_QC   ; NFKC_Quick_Check
9820 NFKD_QC   ; NFKD_Quick_Check
9821 XO_NFC    ; Expands_On_NFC
9822 XO_NFD    ; Expands_On_NFD
9823 XO_NFKC   ; Expands_On_NFKC
9824 XO_NFKD   ; Expands_On_NFKD
9825 END
9826     }
9827     if (-e 'DCoreProperties.txt') {
9828         push @return, split /\n/, <<'END';
9829 Alpha     ; Alphabetic
9830 IDS       ; ID_Start
9831 XIDC      ; XID_Continue
9832 XIDS      ; XID_Start
9833 END
9834         # These can also appear in some versions of PropList.txt
9835         push @return, "Lower     ; Lowercase\n"
9836                                     unless grep { $_ =~ /^Lower\b/} @return;
9837         push @return, "Upper     ; Uppercase\n"
9838                                     unless grep { $_ =~ /^Upper\b/} @return;
9839     }
9840
9841     # This flag requires the DAge.txt file to be copied into the directory.
9842     if (DEBUG && $compare_versions) {
9843         push @return, 'age       ; Age';
9844     }
9845
9846     return @return;
9847 }
9848
9849 sub process_PropValueAliases {
9850     # This file contains values that properties look like:
9851     # bc ; AL        ; Arabic_Letter
9852     # blk; n/a       ; Greek_And_Coptic                 ; Greek
9853     #
9854     # Field 0 is the property.
9855     # Field 1 is the short name of a property value or 'n/a' if no
9856     #                short name exists;
9857     # Field 2 is the full property value name;
9858     # Any other fields are more synonyms for the property value.
9859     # Purely numeric property values are omitted from the file; as are some
9860     # others, fewer and fewer in later releases
9861
9862     # Entries for the ccc property have an extra field before the
9863     # abbreviation:
9864     # ccc;   0; NR   ; Not_Reordered
9865     # It is the numeric value that the names are synonyms for.
9866
9867     # There are comment entries for values missing from this file:
9868     # # @missing: 0000..10FFFF; ISO_Comment; <none>
9869     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9870
9871     my $file= shift;
9872     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9873
9874     # This whole file was non-existent in early releases, so use our own
9875     # internal one if necessary.
9876     if (! -e 'PropValueAliases.txt') {
9877         $file->insert_lines(get_old_property_value_aliases());
9878     }
9879
9880     if ($v_version lt 4.0.0) {
9881         $file->insert_lines(split /\n/, <<'END'
9882 hst; L                                ; Leading_Jamo
9883 hst; LV                               ; LV_Syllable
9884 hst; LVT                              ; LVT_Syllable
9885 hst; NA                               ; Not_Applicable
9886 hst; T                                ; Trailing_Jamo
9887 hst; V                                ; Vowel_Jamo
9888 END
9889         );
9890     }
9891     if ($v_version lt 4.1.0) {
9892         $file->insert_lines(split /\n/, <<'END'
9893 GCB; CN                               ; Control
9894 GCB; CR                               ; CR
9895 GCB; EX                               ; Extend
9896 GCB; L                                ; L
9897 GCB; LF                               ; LF
9898 GCB; LV                               ; LV
9899 GCB; LVT                              ; LVT
9900 GCB; T                                ; T
9901 GCB; V                                ; V
9902 GCB; XX                               ; Other
9903 END
9904         );
9905     }
9906
9907
9908     # Add any explicit cjk values
9909     $file->insert_lines(@cjk_property_values);
9910
9911     # This line is used only for testing the code that checks for name
9912     # conflicts.  There is a script Inherited, and when this line is executed
9913     # it causes there to be a name conflict with the 'Inherited' that this
9914     # program generates for this block property value
9915     #$file->insert_lines('blk; n/a; Herited');
9916
9917
9918     # Process each line of the file ...
9919     while ($file->next_line) {
9920
9921         # Fix typo in input file
9922         s/CCC133/CCC132/g if $v_version eq v6.1.0;
9923
9924         my ($property, @data) = split /\s*;\s*/;
9925
9926         # The ccc property has an extra field at the beginning, which is the
9927         # numeric value.  Move it to be after the other two, mnemonic, fields,
9928         # so that those will be used as the property value's names, and the
9929         # number will be an extra alias.  (Rightmost splice removes field 1-2,
9930         # returning them in a slice; left splice inserts that before anything,
9931         # thus shifting the former field 0 to after them.)
9932         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9933
9934         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
9935         # there is no short name, use the full one in element 1
9936         if ($data[0] eq "n/a") {
9937             $data[0] = $data[1];
9938         }
9939         elsif ($data[0] ne $data[1]
9940                && standardize($data[0]) eq standardize($data[1])
9941                && $data[1] !~ /[[:upper:]]/)
9942         {
9943             # Also, there is a bug in the file in which "n/a" is omitted, and
9944             # the two fields are identical except for case, and the full name
9945             # is all lower case.  Copy the "short" name unto the full one to
9946             # give it some upper case.
9947
9948             $data[1] = $data[0];
9949         }
9950
9951         # Earlier releases had the pseudo property 'qc' that should expand to
9952         # the ones that replace it below.
9953         if ($property eq 'qc') {
9954             if (lc $data[0] eq 'y') {
9955                 $file->insert_lines('NFC_QC; Y      ; Yes',
9956                                     'NFD_QC; Y      ; Yes',
9957                                     'NFKC_QC; Y     ; Yes',
9958                                     'NFKD_QC; Y     ; Yes',
9959                                     );
9960             }
9961             elsif (lc $data[0] eq 'n') {
9962                 $file->insert_lines('NFC_QC; N      ; No',
9963                                     'NFD_QC; N      ; No',
9964                                     'NFKC_QC; N     ; No',
9965                                     'NFKD_QC; N     ; No',
9966                                     );
9967             }
9968             elsif (lc $data[0] eq 'm') {
9969                 $file->insert_lines('NFC_QC; M      ; Maybe',
9970                                     'NFKC_QC; M     ; Maybe',
9971                                     );
9972             }
9973             else {
9974                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9975             }
9976             next;
9977         }
9978
9979         # The first field is the short name, 2nd is the full one.
9980         my $property_object = property_ref($property);
9981         my $table = $property_object->add_match_table($data[0],
9982                                                 Full_Name => $data[1]);
9983
9984         # Start looking for more aliases after these two.
9985         for my $i (2 .. @data - 1) {
9986             $table->add_alias($data[$i]);
9987         }
9988     } # End of looping through the file
9989
9990     # As noted in the comments early in the program, it generates tables for
9991     # the default values for all releases, even those for which the concept
9992     # didn't exist at the time.  Here we add those if missing.
9993     my $age = property_ref('age');
9994     if (defined $age && ! defined $age->table('Unassigned')) {
9995         $age->add_match_table('Unassigned');
9996     }
9997     $block->add_match_table('No_Block') if -e 'Blocks.txt'
9998                                     && ! defined $block->table('No_Block');
9999
10000
10001     # Now set the default mappings of the properties from the file.  This is
10002     # done after the loop because a number of properties have only @missings
10003     # entries in the file, and may not show up until the end.
10004     my @defaults = $file->get_missings;
10005     foreach my $default_ref (@defaults) {
10006         my $default = $default_ref->[0];
10007         my $property = property_ref($default_ref->[1]);
10008         $property->set_default_map($default);
10009     }
10010     return;
10011 }
10012
10013 sub get_old_property_value_aliases () {
10014     # Returns what would be in PropValueAliases.txt if it existed in very old
10015     # versions of Unicode.  It was derived from the one in 3.2, and pared
10016     # down.  An attempt was made to use the existence of files to mean
10017     # inclusion or not of various aliases, but if this was not sufficient,
10018     # using version numbers was resorted to.
10019
10020     my @return = split /\n/, <<'END';
10021 bc ; AN        ; Arabic_Number
10022 bc ; B         ; Paragraph_Separator
10023 bc ; CS        ; Common_Separator
10024 bc ; EN        ; European_Number
10025 bc ; ES        ; European_Separator
10026 bc ; ET        ; European_Terminator
10027 bc ; L         ; Left_To_Right
10028 bc ; ON        ; Other_Neutral
10029 bc ; R         ; Right_To_Left
10030 bc ; WS        ; White_Space
10031
10032 Bidi_M; N; No; F; False
10033 Bidi_M; Y; Yes; T; True
10034
10035 # The standard combining classes are very much different in v1, so only use
10036 # ones that look right (not checked thoroughly)
10037 ccc;   0; NR   ; Not_Reordered
10038 ccc;   1; OV   ; Overlay
10039 ccc;   7; NK   ; Nukta
10040 ccc;   8; KV   ; Kana_Voicing
10041 ccc;   9; VR   ; Virama
10042 ccc; 202; ATBL ; Attached_Below_Left
10043 ccc; 216; ATAR ; Attached_Above_Right
10044 ccc; 218; BL   ; Below_Left
10045 ccc; 220; B    ; Below
10046 ccc; 222; BR   ; Below_Right
10047 ccc; 224; L    ; Left
10048 ccc; 228; AL   ; Above_Left
10049 ccc; 230; A    ; Above
10050 ccc; 232; AR   ; Above_Right
10051 ccc; 234; DA   ; Double_Above
10052
10053 dt ; can       ; canonical
10054 dt ; enc       ; circle
10055 dt ; fin       ; final
10056 dt ; font      ; font
10057 dt ; fra       ; fraction
10058 dt ; init      ; initial
10059 dt ; iso       ; isolated
10060 dt ; med       ; medial
10061 dt ; n/a       ; none
10062 dt ; nb        ; noBreak
10063 dt ; sqr       ; square
10064 dt ; sub       ; sub
10065 dt ; sup       ; super
10066
10067 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10068 gc ; Cc        ; Control
10069 gc ; Cn        ; Unassigned
10070 gc ; Co        ; Private_Use
10071 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10072 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10073 gc ; Ll        ; Lowercase_Letter
10074 gc ; Lm        ; Modifier_Letter
10075 gc ; Lo        ; Other_Letter
10076 gc ; Lu        ; Uppercase_Letter
10077 gc ; M         ; Mark                             # Mc | Me | Mn
10078 gc ; Mc        ; Spacing_Mark
10079 gc ; Mn        ; Nonspacing_Mark
10080 gc ; N         ; Number                           # Nd | Nl | No
10081 gc ; Nd        ; Decimal_Number
10082 gc ; No        ; Other_Number
10083 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10084 gc ; Pd        ; Dash_Punctuation
10085 gc ; Pe        ; Close_Punctuation
10086 gc ; Po        ; Other_Punctuation
10087 gc ; Ps        ; Open_Punctuation
10088 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10089 gc ; Sc        ; Currency_Symbol
10090 gc ; Sm        ; Math_Symbol
10091 gc ; So        ; Other_Symbol
10092 gc ; Z         ; Separator                        # Zl | Zp | Zs
10093 gc ; Zl        ; Line_Separator
10094 gc ; Zp        ; Paragraph_Separator
10095 gc ; Zs        ; Space_Separator
10096
10097 nt ; de        ; Decimal
10098 nt ; di        ; Digit
10099 nt ; n/a       ; None
10100 nt ; nu        ; Numeric
10101 END
10102
10103     if (-e 'ArabicShaping.txt') {
10104         push @return, split /\n/, <<'END';
10105 jg ; n/a       ; AIN
10106 jg ; n/a       ; ALEF
10107 jg ; n/a       ; DAL
10108 jg ; n/a       ; GAF
10109 jg ; n/a       ; LAM
10110 jg ; n/a       ; MEEM
10111 jg ; n/a       ; NO_JOINING_GROUP
10112 jg ; n/a       ; NOON
10113 jg ; n/a       ; QAF
10114 jg ; n/a       ; SAD
10115 jg ; n/a       ; SEEN
10116 jg ; n/a       ; TAH
10117 jg ; n/a       ; WAW
10118
10119 jt ; C         ; Join_Causing
10120 jt ; D         ; Dual_Joining
10121 jt ; L         ; Left_Joining
10122 jt ; R         ; Right_Joining
10123 jt ; U         ; Non_Joining
10124 jt ; T         ; Transparent
10125 END
10126         if ($v_version ge v3.0.0) {
10127             push @return, split /\n/, <<'END';
10128 jg ; n/a       ; ALAPH
10129 jg ; n/a       ; BEH
10130 jg ; n/a       ; BETH
10131 jg ; n/a       ; DALATH_RISH
10132 jg ; n/a       ; E
10133 jg ; n/a       ; FEH
10134 jg ; n/a       ; FINAL_SEMKATH
10135 jg ; n/a       ; GAMAL
10136 jg ; n/a       ; HAH
10137 jg ; n/a       ; HAMZA_ON_HEH_GOAL
10138 jg ; n/a       ; HE
10139 jg ; n/a       ; HEH
10140 jg ; n/a       ; HEH_GOAL
10141 jg ; n/a       ; HETH
10142 jg ; n/a       ; KAF
10143 jg ; n/a       ; KAPH
10144 jg ; n/a       ; KNOTTED_HEH
10145 jg ; n/a       ; LAMADH
10146 jg ; n/a       ; MIM
10147 jg ; n/a       ; NUN
10148 jg ; n/a       ; PE
10149 jg ; n/a       ; QAPH
10150 jg ; n/a       ; REH
10151 jg ; n/a       ; REVERSED_PE
10152 jg ; n/a       ; SADHE
10153 jg ; n/a       ; SEMKATH
10154 jg ; n/a       ; SHIN
10155 jg ; n/a       ; SWASH_KAF
10156 jg ; n/a       ; TAW
10157 jg ; n/a       ; TEH_MARBUTA
10158 jg ; n/a       ; TETH
10159 jg ; n/a       ; YEH
10160 jg ; n/a       ; YEH_BARREE
10161 jg ; n/a       ; YEH_WITH_TAIL
10162 jg ; n/a       ; YUDH
10163 jg ; n/a       ; YUDH_HE
10164 jg ; n/a       ; ZAIN
10165 END
10166         }
10167     }
10168
10169
10170     if (-e 'EastAsianWidth.txt') {
10171         push @return, split /\n/, <<'END';
10172 ea ; A         ; Ambiguous
10173 ea ; F         ; Fullwidth
10174 ea ; H         ; Halfwidth
10175 ea ; N         ; Neutral
10176 ea ; Na        ; Narrow
10177 ea ; W         ; Wide
10178 END
10179     }
10180
10181     if (-e 'LineBreak.txt') {
10182         push @return, split /\n/, <<'END';
10183 lb ; AI        ; Ambiguous
10184 lb ; AL        ; Alphabetic
10185 lb ; B2        ; Break_Both
10186 lb ; BA        ; Break_After
10187 lb ; BB        ; Break_Before
10188 lb ; BK        ; Mandatory_Break
10189 lb ; CB        ; Contingent_Break
10190 lb ; CL        ; Close_Punctuation
10191 lb ; CM        ; Combining_Mark
10192 lb ; CR        ; Carriage_Return
10193 lb ; EX        ; Exclamation
10194 lb ; GL        ; Glue
10195 lb ; HY        ; Hyphen
10196 lb ; ID        ; Ideographic
10197 lb ; IN        ; Inseperable
10198 lb ; IS        ; Infix_Numeric
10199 lb ; LF        ; Line_Feed
10200 lb ; NS        ; Nonstarter
10201 lb ; NU        ; Numeric
10202 lb ; OP        ; Open_Punctuation
10203 lb ; PO        ; Postfix_Numeric
10204 lb ; PR        ; Prefix_Numeric
10205 lb ; QU        ; Quotation
10206 lb ; SA        ; Complex_Context
10207 lb ; SG        ; Surrogate
10208 lb ; SP        ; Space
10209 lb ; SY        ; Break_Symbols
10210 lb ; XX        ; Unknown
10211 lb ; ZW        ; ZWSpace
10212 END
10213     }
10214
10215     if (-e 'DNormalizationProps.txt') {
10216         push @return, split /\n/, <<'END';
10217 qc ; M         ; Maybe
10218 qc ; N         ; No
10219 qc ; Y         ; Yes
10220 END
10221     }
10222
10223     if (-e 'Scripts.txt') {
10224         push @return, split /\n/, <<'END';
10225 sc ; Arab      ; Arabic
10226 sc ; Armn      ; Armenian
10227 sc ; Beng      ; Bengali
10228 sc ; Bopo      ; Bopomofo
10229 sc ; Cans      ; Canadian_Aboriginal
10230 sc ; Cher      ; Cherokee
10231 sc ; Cyrl      ; Cyrillic
10232 sc ; Deva      ; Devanagari
10233 sc ; Dsrt      ; Deseret
10234 sc ; Ethi      ; Ethiopic
10235 sc ; Geor      ; Georgian
10236 sc ; Goth      ; Gothic
10237 sc ; Grek      ; Greek
10238 sc ; Gujr      ; Gujarati
10239 sc ; Guru      ; Gurmukhi
10240 sc ; Hang      ; Hangul
10241 sc ; Hani      ; Han
10242 sc ; Hebr      ; Hebrew
10243 sc ; Hira      ; Hiragana
10244 sc ; Ital      ; Old_Italic
10245 sc ; Kana      ; Katakana
10246 sc ; Khmr      ; Khmer
10247 sc ; Knda      ; Kannada
10248 sc ; Laoo      ; Lao
10249 sc ; Latn      ; Latin
10250 sc ; Mlym      ; Malayalam
10251 sc ; Mong      ; Mongolian
10252 sc ; Mymr      ; Myanmar
10253 sc ; Ogam      ; Ogham
10254 sc ; Orya      ; Oriya
10255 sc ; Qaai      ; Inherited
10256 sc ; Runr      ; Runic
10257 sc ; Sinh      ; Sinhala
10258 sc ; Syrc      ; Syriac
10259 sc ; Taml      ; Tamil
10260 sc ; Telu      ; Telugu
10261 sc ; Thaa      ; Thaana
10262 sc ; Thai      ; Thai
10263 sc ; Tibt      ; Tibetan
10264 sc ; Yiii      ; Yi
10265 sc ; Zyyy      ; Common
10266 END
10267     }
10268
10269     if ($v_version ge v2.0.0) {
10270         push @return, split /\n/, <<'END';
10271 dt ; com       ; compat
10272 dt ; nar       ; narrow
10273 dt ; sml       ; small
10274 dt ; vert      ; vertical
10275 dt ; wide      ; wide
10276
10277 gc ; Cf        ; Format
10278 gc ; Cs        ; Surrogate
10279 gc ; Lt        ; Titlecase_Letter
10280 gc ; Me        ; Enclosing_Mark
10281 gc ; Nl        ; Letter_Number
10282 gc ; Pc        ; Connector_Punctuation
10283 gc ; Sk        ; Modifier_Symbol
10284 END
10285     }
10286     if ($v_version ge v2.1.2) {
10287         push @return, "bc ; S         ; Segment_Separator\n";
10288     }
10289     if ($v_version ge v2.1.5) {
10290         push @return, split /\n/, <<'END';
10291 gc ; Pf        ; Final_Punctuation
10292 gc ; Pi        ; Initial_Punctuation
10293 END
10294     }
10295     if ($v_version ge v2.1.8) {
10296         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
10297     }
10298
10299     if ($v_version ge v3.0.0) {
10300         push @return, split /\n/, <<'END';
10301 bc ; AL        ; Arabic_Letter
10302 bc ; BN        ; Boundary_Neutral
10303 bc ; LRE       ; Left_To_Right_Embedding
10304 bc ; LRO       ; Left_To_Right_Override
10305 bc ; NSM       ; Nonspacing_Mark
10306 bc ; PDF       ; Pop_Directional_Format
10307 bc ; RLE       ; Right_To_Left_Embedding
10308 bc ; RLO       ; Right_To_Left_Override
10309
10310 ccc; 233; DB   ; Double_Below
10311 END
10312     }
10313
10314     if ($v_version ge v3.1.0) {
10315         push @return, "ccc; 226; R    ; Right\n";
10316     }
10317
10318     return @return;
10319 }
10320
10321 sub process_NormalizationsTest {
10322
10323     # Each line looks like:
10324     #      source code point; NFC; NFD; NFKC; NFKD
10325     # e.g.
10326     #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
10327
10328     my $file= shift;
10329     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10330
10331     # Process each line of the file ...
10332     while ($file->next_line) {
10333
10334         next if /^@/;
10335
10336         my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10337
10338         foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10339             $$var = pack "U0U*", map { hex } split " ", $$var;
10340             $$var =~ s/(\\)/$1$1/g;
10341         }
10342
10343         push @normalization_tests,
10344                 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
10345     } # End of looping through the file
10346 }
10347
10348 sub output_perl_charnames_line ($$) {
10349
10350     # Output the entries in Perl_charnames specially, using 5 digits instead
10351     # of four.  This makes the entries a constant length, and simplifies
10352     # charnames.pm which this table is for.  Unicode can have 6 digit
10353     # ordinals, but they are all private use or noncharacters which do not
10354     # have names, so won't be in this table.
10355
10356     return sprintf "%05X\t%s\n", $_[0], $_[1];
10357 }
10358
10359 { # Closure
10360     # This is used to store the range list of all the code points usable when
10361     # the little used $compare_versions feature is enabled.
10362     my $compare_versions_range_list;
10363
10364     # These are constants to the $property_info hash in this subroutine, to
10365     # avoid using a quoted-string which might have a typo.
10366     my $TYPE  = 'type';
10367     my $DEFAULT_MAP = 'default_map';
10368     my $DEFAULT_TABLE = 'default_table';
10369     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10370     my $MISSINGS = 'missings';
10371
10372     sub process_generic_property_file {
10373         # This processes a file containing property mappings and puts them
10374         # into internal map tables.  It should be used to handle any property
10375         # files that have mappings from a code point or range thereof to
10376         # something else.  This means almost all the UCD .txt files.
10377         # each_line_handlers() should be set to adjust the lines of these
10378         # files, if necessary, to what this routine understands:
10379         #
10380         # 0374          ; NFD_QC; N
10381         # 003C..003E    ; Math
10382         #
10383         # the fields are: "codepoint-range ; property; map"
10384         #
10385         # meaning the codepoints in the range all have the value 'map' under
10386         # 'property'.
10387         # Beginning and trailing white space in each field are not significant.
10388         # Note there is not a trailing semi-colon in the above.  A trailing
10389         # semi-colon means the map is a null-string.  An omitted map, as
10390         # opposed to a null-string, is assumed to be 'Y', based on Unicode
10391         # table syntax.  (This could have been hidden from this routine by
10392         # doing it in the $file object, but that would require parsing of the
10393         # line there, so would have to parse it twice, or change the interface
10394         # to pass this an array.  So not done.)
10395         #
10396         # The map field may begin with a sequence of commands that apply to
10397         # this range.  Each such command begins and ends with $CMD_DELIM.
10398         # These are used to indicate, for example, that the mapping for a
10399         # range has a non-default type.
10400         #
10401         # This loops through the file, calling its next_line() method, and
10402         # then taking the map and adding it to the property's table.
10403         # Complications arise because any number of properties can be in the
10404         # file, in any order, interspersed in any way.  The first time a
10405         # property is seen, it gets information about that property and
10406         # caches it for quick retrieval later.  It also normalizes the maps
10407         # so that only one of many synonyms is stored.  The Unicode input
10408         # files do use some multiple synonyms.
10409
10410         my $file = shift;
10411         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10412
10413         my %property_info;               # To keep track of what properties
10414                                          # have already had entries in the
10415                                          # current file, and info about each,
10416                                          # so don't have to recompute.
10417         my $property_name;               # property currently being worked on
10418         my $property_type;               # and its type
10419         my $previous_property_name = ""; # name from last time through loop
10420         my $property_object;             # pointer to the current property's
10421                                          # object
10422         my $property_addr;               # the address of that object
10423         my $default_map;                 # the string that code points missing
10424                                          # from the file map to
10425         my $default_table;               # For non-string properties, a
10426                                          # reference to the match table that
10427                                          # will contain the list of code
10428                                          # points that map to $default_map.
10429
10430         # Get the next real non-comment line
10431         LINE:
10432         while ($file->next_line) {
10433
10434             # Default replacement type; means that if parts of the range have
10435             # already been stored in our tables, the new map overrides them if
10436             # they differ more than cosmetically
10437             my $replace = $IF_NOT_EQUIVALENT;
10438             my $map_type;            # Default type for the map of this range
10439
10440             #local $to_trace = 1 if main::DEBUG;
10441             trace $_ if main::DEBUG && $to_trace;
10442
10443             # Split the line into components
10444             my ($range, $property_name, $map, @remainder)
10445                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10446
10447             # If more or less on the line than we are expecting, warn and skip
10448             # the line
10449             if (@remainder) {
10450                 $file->carp_bad_line('Extra fields');
10451                 next LINE;
10452             }
10453             elsif ( ! defined $property_name) {
10454                 $file->carp_bad_line('Missing property');
10455                 next LINE;
10456             }
10457
10458             # Examine the range.
10459             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10460             {
10461                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10462                 next LINE;
10463             }
10464             my $low = hex $1;
10465             my $high = (defined $2) ? hex $2 : $low;
10466
10467             # For the very specialized case of comparing two Unicode
10468             # versions...
10469             if (DEBUG && $compare_versions) {
10470                 if ($property_name eq 'Age') {
10471
10472                     # Only allow code points at least as old as the version
10473                     # specified.
10474                     my $age = pack "C*", split(/\./, $map);        # v string
10475                     next LINE if $age gt $compare_versions;
10476                 }
10477                 else {
10478
10479                     # Again, we throw out code points younger than those of
10480                     # the specified version.  By now, the Age property is
10481                     # populated.  We use the intersection of each input range
10482                     # with this property to find what code points in it are
10483                     # valid.   To do the intersection, we have to convert the
10484                     # Age property map to a Range_list.  We only have to do
10485                     # this once.
10486                     if (! defined $compare_versions_range_list) {
10487                         my $age = property_ref('Age');
10488                         if (! -e 'DAge.txt') {
10489                             croak "Need to have 'DAge.txt' file to do version comparison";
10490                         }
10491                         elsif ($age->count == 0) {
10492                             croak "The 'Age' table is empty, but its file exists";
10493                         }
10494                         $compare_versions_range_list
10495                                         = Range_List->new(Initialize => $age);
10496                     }
10497
10498                     # An undefined map is always 'Y'
10499                     $map = 'Y' if ! defined $map;
10500
10501                     # Calculate the intersection of the input range with the
10502                     # code points that are known in the specified version
10503                     my @ranges = ($compare_versions_range_list
10504                                   & Range->new($low, $high))->ranges;
10505
10506                     # If the intersection is empty, throw away this range
10507                     next LINE unless @ranges;
10508
10509                     # Only examine the first range this time through the loop.
10510                     my $this_range = shift @ranges;
10511
10512                     # Put any remaining ranges in the queue to be processed
10513                     # later.  Note that there is unnecessary work here, as we
10514                     # will do the intersection again for each of these ranges
10515                     # during some future iteration of the LINE loop, but this
10516                     # code is not used in production.  The later intersections
10517                     # are guaranteed to not splinter, so this will not become
10518                     # an infinite loop.
10519                     my $line = join ';', $property_name, $map;
10520                     foreach my $range (@ranges) {
10521                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
10522                                                             $range->start,
10523                                                             $range->end,
10524                                                             $line));
10525                     }
10526
10527                     # And process the first range, like any other.
10528                     $low = $this_range->start;
10529                     $high = $this_range->end;
10530                 }
10531             } # End of $compare_versions
10532
10533             # If changing to a new property, get the things constant per
10534             # property
10535             if ($previous_property_name ne $property_name) {
10536
10537                 $property_object = property_ref($property_name);
10538                 if (! defined $property_object) {
10539                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10540                     next LINE;
10541                 }
10542                 { no overloading; $property_addr = pack 'J', $property_object; }
10543
10544                 # Defer changing names until have a line that is acceptable
10545                 # (the 'next' statement above means is unacceptable)
10546                 $previous_property_name = $property_name;
10547
10548                 # If not the first time for this property, retrieve info about
10549                 # it from the cache
10550                 if (defined ($property_info{$property_addr}{$TYPE})) {
10551                     $property_type = $property_info{$property_addr}{$TYPE};
10552                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10553                     $map_type
10554                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10555                     $default_table
10556                             = $property_info{$property_addr}{$DEFAULT_TABLE};
10557                 }
10558                 else {
10559
10560                     # Here, is the first time for this property.  Set up the
10561                     # cache.
10562                     $property_type = $property_info{$property_addr}{$TYPE}
10563                                    = $property_object->type;
10564                     $map_type
10565                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10566                         = $property_object->pseudo_map_type;
10567
10568                     # The Unicode files are set up so that if the map is not
10569                     # defined, it is a binary property
10570                     if (! defined $map && $property_type != $BINARY) {
10571                         if ($property_type != $UNKNOWN
10572                             && $property_type != $NON_STRING)
10573                         {
10574                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10575                         }
10576                         else {
10577                             $property_object->set_type($BINARY);
10578                             $property_type
10579                                 = $property_info{$property_addr}{$TYPE}
10580                                 = $BINARY;
10581                         }
10582                     }
10583
10584                     # Get any @missings default for this property.  This
10585                     # should precede the first entry for the property in the
10586                     # input file, and is located in a comment that has been
10587                     # stored by the Input_file class until we access it here.
10588                     # It's possible that there is more than one such line
10589                     # waiting for us; collect them all, and parse
10590                     my @missings_list = $file->get_missings
10591                                             if $file->has_missings_defaults;
10592                     foreach my $default_ref (@missings_list) {
10593                         my $default = $default_ref->[0];
10594                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10595
10596                         # For string properties, the default is just what the
10597                         # file says, but non-string properties should already
10598                         # have set up a table for the default property value;
10599                         # use the table for these, so can resolve synonyms
10600                         # later to a single standard one.
10601                         if ($property_type == $STRING
10602                             || $property_type == $UNKNOWN)
10603                         {
10604                             $property_info{$addr}{$MISSINGS} = $default;
10605                         }
10606                         else {
10607                             $property_info{$addr}{$MISSINGS}
10608                                         = $property_object->table($default);
10609                         }
10610                     }
10611
10612                     # Finished storing all the @missings defaults in the input
10613                     # file so far.  Get the one for the current property.
10614                     my $missings = $property_info{$property_addr}{$MISSINGS};
10615
10616                     # But we likely have separately stored what the default
10617                     # should be.  (This is to accommodate versions of the
10618                     # standard where the @missings lines are absent or
10619                     # incomplete.)  Hopefully the two will match.  But check
10620                     # it out.
10621                     $default_map = $property_object->default_map;
10622
10623                     # If the map is a ref, it means that the default won't be
10624                     # processed until later, so undef it, so next few lines
10625                     # will redefine it to something that nothing will match
10626                     undef $default_map if ref $default_map;
10627
10628                     # Create a $default_map if don't have one; maybe a dummy
10629                     # that won't match anything.
10630                     if (! defined $default_map) {
10631
10632                         # Use any @missings line in the file.
10633                         if (defined $missings) {
10634                             if (ref $missings) {
10635                                 $default_map = $missings->full_name;
10636                                 $default_table = $missings;
10637                             }
10638                             else {
10639                                 $default_map = $missings;
10640                             }
10641
10642                             # And store it with the property for outside use.
10643                             $property_object->set_default_map($default_map);
10644                         }
10645                         else {
10646
10647                             # Neither an @missings nor a default map.  Create
10648                             # a dummy one, so won't have to test definedness
10649                             # in the main loop.
10650                             $default_map = '_Perl This will never be in a file
10651                                             from Unicode';
10652                         }
10653                     }
10654
10655                     # Here, we have $default_map defined, possibly in terms of
10656                     # $missings, but maybe not, and possibly is a dummy one.
10657                     if (defined $missings) {
10658
10659                         # Make sure there is no conflict between the two.
10660                         # $missings has priority.
10661                         if (ref $missings) {
10662                             $default_table
10663                                         = $property_object->table($default_map);
10664                             if (! defined $default_table
10665                                 || $default_table != $missings)
10666                             {
10667                                 if (! defined $default_table) {
10668                                     $default_table = $UNDEF;
10669                                 }
10670                                 $file->carp_bad_line(<<END
10671 The \@missings line for $property_name in $file says that missings default to
10672 $missings, but we expect it to be $default_table.  $missings used.
10673 END
10674                                 );
10675                                 $default_table = $missings;
10676                                 $default_map = $missings->full_name;
10677                             }
10678                             $property_info{$property_addr}{$DEFAULT_TABLE}
10679                                                         = $default_table;
10680                         }
10681                         elsif ($default_map ne $missings) {
10682                             $file->carp_bad_line(<<END
10683 The \@missings line for $property_name in $file says that missings default to
10684 $missings, but we expect it to be $default_map.  $missings used.
10685 END
10686                             );
10687                             $default_map = $missings;
10688                         }
10689                     }
10690
10691                     $property_info{$property_addr}{$DEFAULT_MAP}
10692                                                     = $default_map;
10693
10694                     # If haven't done so already, find the table corresponding
10695                     # to this map for non-string properties.
10696                     if (! defined $default_table
10697                         && $property_type != $STRING
10698                         && $property_type != $UNKNOWN)
10699                     {
10700                         $default_table = $property_info{$property_addr}
10701                                                         {$DEFAULT_TABLE}
10702                                     = $property_object->table($default_map);
10703                     }
10704                 } # End of is first time for this property
10705             } # End of switching properties.
10706
10707             # Ready to process the line.
10708             # The Unicode files are set up so that if the map is not defined,
10709             # it is a binary property with value 'Y'
10710             if (! defined $map) {
10711                 $map = 'Y';
10712             }
10713             else {
10714
10715                 # If the map begins with a special command to us (enclosed in
10716                 # delimiters), extract the command(s).
10717                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10718                     my $command = $1;
10719                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
10720                         $replace = $1;
10721                     }
10722                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
10723                         $map_type = $1;
10724                     }
10725                     else {
10726                         $file->carp_bad_line("Unknown command line: '$1'");
10727                         next LINE;
10728                     }
10729                 }
10730             }
10731
10732             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10733             {
10734
10735                 # Here, we have a map to a particular code point, and the
10736                 # default map is to a code point itself.  If the range
10737                 # includes the particular code point, change that portion of
10738                 # the range to the default.  This makes sure that in the final
10739                 # table only the non-defaults are listed.
10740                 my $decimal_map = hex $map;
10741                 if ($low <= $decimal_map && $decimal_map <= $high) {
10742
10743                     # If the range includes stuff before or after the map
10744                     # we're changing, split it and process the split-off parts
10745                     # later.
10746                     if ($low < $decimal_map) {
10747                         $file->insert_adjusted_lines(
10748                                             sprintf("%04X..%04X; %s; %s",
10749                                                     $low,
10750                                                     $decimal_map - 1,
10751                                                     $property_name,
10752                                                     $map));
10753                     }
10754                     if ($high > $decimal_map) {
10755                         $file->insert_adjusted_lines(
10756                                             sprintf("%04X..%04X; %s; %s",
10757                                                     $decimal_map + 1,
10758                                                     $high,
10759                                                     $property_name,
10760                                                     $map));
10761                     }
10762                     $low = $high = $decimal_map;
10763                     $map = $CODE_POINT;
10764                 }
10765             }
10766
10767             # If we can tell that this is a synonym for the default map, use
10768             # the default one instead.
10769             if ($property_type != $STRING
10770                 && $property_type != $UNKNOWN)
10771             {
10772                 my $table = $property_object->table($map);
10773                 if (defined $table && $table == $default_table) {
10774                     $map = $default_map;
10775                 }
10776             }
10777
10778             # And figure out the map type if not known.
10779             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10780                 if ($map eq "") {   # Nulls are always $NULL map type
10781                     $map_type = $NULL;
10782                 } # Otherwise, non-strings, and those that don't allow
10783                   # $MULTI_CP, and those that aren't multiple code points are
10784                   # 0
10785                 elsif
10786                    (($property_type != $STRING && $property_type != $UNKNOWN)
10787                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10788                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
10789                 {
10790                     $map_type = 0;
10791                 }
10792                 else {
10793                     $map_type = $MULTI_CP;
10794                 }
10795             }
10796
10797             $property_object->add_map($low, $high,
10798                                         $map,
10799                                         Type => $map_type,
10800                                         Replace => $replace);
10801         } # End of loop through file's lines
10802
10803         return;
10804     }
10805 }
10806
10807 { # Closure for UnicodeData.txt handling
10808
10809     # This file was the first one in the UCD; its design leads to some
10810     # awkwardness in processing.  Here is a sample line:
10811     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10812     # The fields in order are:
10813     my $i = 0;            # The code point is in field 0, and is shifted off.
10814     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
10815     my $CATEGORY = $i++;  # category (e.g. "Lu")
10816     my $CCC = $i++;       # Canonical combining class (e.g. "230")
10817     my $BIDI = $i++;      # directional class (e.g. "L")
10818     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
10819     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
10820     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10821                                          # Dual-use in this program; see below
10822     my $NUMERIC = $i++;   # numeric value
10823     my $MIRRORED = $i++;  # ? mirrored
10824     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10825     my $COMMENT = $i++;   # iso comment
10826     my $UPPER = $i++;     # simple uppercase mapping
10827     my $LOWER = $i++;     # simple lowercase mapping
10828     my $TITLE = $i++;     # simple titlecase mapping
10829     my $input_field_count = $i;
10830
10831     # This routine in addition outputs these extra fields:
10832
10833     my $DECOMP_TYPE = $i++; # Decomposition type
10834
10835     # These fields are modifications of ones above, and are usually
10836     # suppressed; they must come last, as for speed, the loop upper bound is
10837     # normally set to ignore them
10838     my $NAME = $i++;        # This is the strict name field, not the one that
10839                             # charnames uses.
10840     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
10841                             # by Unicode::Normalize
10842     my $last_field = $i - 1;
10843
10844     # All these are read into an array for each line, with the indices defined
10845     # above.  The empty fields in the example line above indicate that the
10846     # value is defaulted.  The handler called for each line of the input
10847     # changes these to their defaults.
10848
10849     # Here are the official names of the properties, in a parallel array:
10850     my @field_names;
10851     $field_names[$BIDI] = 'Bidi_Class';
10852     $field_names[$CATEGORY] = 'General_Category';
10853     $field_names[$CCC] = 'Canonical_Combining_Class';
10854     $field_names[$CHARNAME] = 'Perl_Charnames';
10855     $field_names[$COMMENT] = 'ISO_Comment';
10856     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10857     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10858     $field_names[$LOWER] = 'Lowercase_Mapping';
10859     $field_names[$MIRRORED] = 'Bidi_Mirrored';
10860     $field_names[$NAME] = 'Name';
10861     $field_names[$NUMERIC] = 'Numeric_Value';
10862     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10863     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10864     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10865     $field_names[$TITLE] = 'Titlecase_Mapping';
10866     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10867     $field_names[$UPPER] = 'Uppercase_Mapping';
10868
10869     # Some of these need a little more explanation:
10870     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10871     #   property, but is used in calculating the Numeric_Type.  Perl however,
10872     #   creates a file from this field, so a Perl property is created from it.
10873     # Similarly, the Other_Digit field is used only for calculating the
10874     #   Numeric_Type, and so it can be safely re-used as the place to store
10875     #   the value for Numeric_Type; hence it is referred to as
10876     #   $NUMERIC_TYPE_OTHER_DIGIT.
10877     # The input field named $PERL_DECOMPOSITION is a combination of both the
10878     #   decomposition mapping and its type.  Perl creates a file containing
10879     #   exactly this field, so it is used for that.  The two properties are
10880     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10881     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
10882     #   output it), as Perl doesn't use it directly.
10883     # The input field named here $CHARNAME is used to construct the
10884     #   Perl_Charnames property, which is a combination of the Name property
10885     #   (which the input field contains), and the Unicode_1_Name property, and
10886     #   others from other files.  Since, the strict Name property is not used
10887     #   by Perl, this field is used for the table that Perl does use.  The
10888     #   strict Name property table is usually suppressed (unless the lists are
10889     #   changed to output it), so it is accumulated in a separate field,
10890     #   $NAME, which to save time is discarded unless the table is actually to
10891     #   be output
10892
10893     # This file is processed like most in this program.  Control is passed to
10894     # process_generic_property_file() which calls filter_UnicodeData_line()
10895     # for each input line.  This filter converts the input into line(s) that
10896     # process_generic_property_file() understands.  There is also a setup
10897     # routine called before any of the file is processed, and a handler for
10898     # EOF processing, all in this closure.
10899
10900     # A huge speed-up occurred at the cost of some added complexity when these
10901     # routines were altered to buffer the outputs into ranges.  Almost all the
10902     # lines of the input file apply to just one code point, and for most
10903     # properties, the map for the next code point up is the same as the
10904     # current one.  So instead of creating a line for each property for each
10905     # input line, filter_UnicodeData_line() remembers what the previous map
10906     # of a property was, and doesn't generate a line to pass on until it has
10907     # to, as when the map changes; and that passed-on line encompasses the
10908     # whole contiguous range of code points that have the same map for that
10909     # property.  This means a slight amount of extra setup, and having to
10910     # flush these buffers on EOF, testing if the maps have changed, plus
10911     # remembering state information in the closure.  But it means a lot less
10912     # real time in not having to change the data base for each property on
10913     # each line.
10914
10915     # Another complication is that there are already a few ranges designated
10916     # in the input.  There are two lines for each, with the same maps except
10917     # the code point and name on each line.  This was actually the hardest
10918     # thing to design around.  The code points in those ranges may actually
10919     # have real maps not given by these two lines.  These maps will either
10920     # be algorithmically determinable, or be in the extracted files furnished
10921     # with the UCD.  In the event of conflicts between these extracted files,
10922     # and this one, Unicode says that this one prevails.  But it shouldn't
10923     # prevail for conflicts that occur in these ranges.  The data from the
10924     # extracted files prevails in those cases.  So, this program is structured
10925     # so that those files are processed first, storing maps.  Then the other
10926     # files are processed, generally overwriting what the extracted files
10927     # stored.  But just the range lines in this input file are processed
10928     # without overwriting.  This is accomplished by adding a special string to
10929     # the lines output to tell process_generic_property_file() to turn off the
10930     # overwriting for just this one line.
10931     # A similar mechanism is used to tell it that the map is of a non-default
10932     # type.
10933
10934     sub setup_UnicodeData { # Called before any lines of the input are read
10935         my $file = shift;
10936         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10937
10938         # Create a new property specially located that is a combination of the
10939         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10940         # Name_Alias properties.  (The final duplicates elements of the
10941         # first.)  A comment for it will later be constructed based on the
10942         # actual properties present and used
10943         $perl_charname = Property->new('Perl_Charnames',
10944                        Default_Map => "",
10945                        Directory => File::Spec->curdir(),
10946                        File => 'Name',
10947                        Fate => $INTERNAL_ONLY,
10948                        Perl_Extension => 1,
10949                        Range_Size_1 => \&output_perl_charnames_line,
10950                        Type => $STRING,
10951                        );
10952         $perl_charname->set_proxy_for('Name');
10953
10954         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10955                                         Directory => File::Spec->curdir(),
10956                                         File => 'Decomposition',
10957                                         Format => $DECOMP_STRING_FORMAT,
10958                                         Fate => $INTERNAL_ONLY,
10959                                         Perl_Extension => 1,
10960                                         Default_Map => $CODE_POINT,
10961
10962                                         # normalize.pm can't cope with these
10963                                         Output_Range_Counts => 0,
10964
10965                                         # This is a specially formatted table
10966                                         # explicitly for normalize.pm, which
10967                                         # is expecting a particular format,
10968                                         # which means that mappings containing
10969                                         # multiple code points are in the main
10970                                         # body of the table
10971                                         Map_Type => $COMPUTE_NO_MULTI_CP,
10972                                         Type => $STRING,
10973                                         To_Output_Map => $INTERNAL_MAP,
10974                                         );
10975         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10976         $Perl_decomp->add_comment(join_lines(<<END
10977 This mapping is a combination of the Unicode 'Decomposition_Type' and
10978 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
10979 identical to the official Unicode 'Decomposition_Mapping' property except for
10980 two things:
10981  1) It omits the algorithmically determinable Hangul syllable decompositions,
10982 which normalize.pm handles algorithmically.
10983  2) It contains the decomposition type as well.  Non-canonical decompositions
10984 begin with a word in angle brackets, like <super>, which denotes the
10985 compatible decomposition type.  If the map does not begin with the <angle
10986 brackets>, the decomposition is canonical.
10987 END
10988         ));
10989
10990         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10991                                         Default_Map => "",
10992                                         Perl_Extension => 1,
10993                                         Directory => $map_directory,
10994                                         Type => $STRING,
10995                                         To_Output_Map => $OUTPUT_ADJUSTED,
10996                                         );
10997         $Decimal_Digit->add_comment(join_lines(<<END
10998 This file gives the mapping of all code points which represent a single
10999 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11000 points, and the mapping of each non-initial element of each range is actually
11001 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11002 These code points are those that have Numeric_Type=Decimal; not special
11003 things, like subscripts nor Roman numerals.
11004 END
11005         ));
11006
11007         # These properties are not used for generating anything else, and are
11008         # usually not output.  By making them last in the list, we can just
11009         # change the high end of the loop downwards to avoid the work of
11010         # generating a table(s) that is/are just going to get thrown away.
11011         if (! property_ref('Decomposition_Mapping')->to_output_map
11012             && ! property_ref('Name')->to_output_map)
11013         {
11014             $last_field = min($NAME, $DECOMP_MAP) - 1;
11015         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11016             $last_field = $DECOMP_MAP;
11017         } elsif (property_ref('Name')->to_output_map) {
11018             $last_field = $NAME;
11019         }
11020         return;
11021     }
11022
11023     my $first_time = 1;                 # ? Is this the first line of the file
11024     my $in_range = 0;                   # ? Are we in one of the file's ranges
11025     my $previous_cp;                    # hex code point of previous line
11026     my $decimal_previous_cp = -1;       # And its decimal equivalent
11027     my @start;                          # For each field, the current starting
11028                                         # code point in hex for the range
11029                                         # being accumulated.
11030     my @fields;                         # The input fields;
11031     my @previous_fields;                # And those from the previous call
11032
11033     sub filter_UnicodeData_line {
11034         # Handle a single input line from UnicodeData.txt; see comments above
11035         # Conceptually this takes a single line from the file containing N
11036         # properties, and converts it into N lines with one property per line,
11037         # which is what the final handler expects.  But there are
11038         # complications due to the quirkiness of the input file, and to save
11039         # time, it accumulates ranges where the property values don't change
11040         # and only emits lines when necessary.  This is about an order of
11041         # magnitude fewer lines emitted.
11042
11043         my $file = shift;
11044         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11045
11046         # $_ contains the input line.
11047         # -1 in split means retain trailing null fields
11048         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11049
11050         #local $to_trace = 1 if main::DEBUG;
11051         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11052         if (@fields > $input_field_count) {
11053             $file->carp_bad_line('Extra fields');
11054             $_ = "";
11055             return;
11056         }
11057
11058         my $decimal_cp = hex $cp;
11059
11060         # We have to output all the buffered ranges when the next code point
11061         # is not exactly one after the previous one, which means there is a
11062         # gap in the ranges.
11063         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11064
11065         # The decomposition mapping field requires special handling.  It looks
11066         # like either:
11067         #
11068         # <compat> 0032 0020
11069         # 0041 0300
11070         #
11071         # The decomposition type is enclosed in <brackets>; if missing, it
11072         # means the type is canonical.  There are two decomposition mapping
11073         # tables: the one for use by Perl's normalize.pm has a special format
11074         # which is this field intact; the other, for general use is of
11075         # standard format.  In either case we have to find the decomposition
11076         # type.  Empty fields have None as their type, and map to the code
11077         # point itself
11078         if ($fields[$PERL_DECOMPOSITION] eq "") {
11079             $fields[$DECOMP_TYPE] = 'None';
11080             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11081         }
11082         else {
11083             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11084                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
11085             if (! defined $fields[$DECOMP_TYPE]) {
11086                 $fields[$DECOMP_TYPE] = 'Canonical';
11087                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11088             }
11089             else {
11090                 $fields[$DECOMP_MAP] = $map;
11091             }
11092         }
11093
11094         # The 3 numeric fields also require special handling.  The 2 digit
11095         # fields must be either empty or match the number field.  This means
11096         # that if it is empty, they must be as well, and the numeric type is
11097         # None, and the numeric value is 'Nan'.
11098         # The decimal digit field must be empty or match the other digit
11099         # field.  If the decimal digit field is non-empty, the code point is
11100         # a decimal digit, and the other two fields will have the same value.
11101         # If it is empty, but the other digit field is non-empty, the code
11102         # point is an 'other digit', and the number field will have the same
11103         # value as the other digit field.  If the other digit field is empty,
11104         # but the number field is non-empty, the code point is a generic
11105         # numeric type.
11106         if ($fields[$NUMERIC] eq "") {
11107             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11108                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11109             ) {
11110                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11111             }
11112             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11113             $fields[$NUMERIC] = 'NaN';
11114         }
11115         else {
11116             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11117             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11118                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11119                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11120                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11121             }
11122             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11123                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11124                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11125             }
11126             else {
11127                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11128
11129                 # Rationals require extra effort.
11130                 register_fraction($fields[$NUMERIC])
11131                                                 if $fields[$NUMERIC] =~ qr{/};
11132             }
11133         }
11134
11135         # For the properties that have empty fields in the file, and which
11136         # mean something different from empty, change them to that default.
11137         # Certain fields just haven't been empty so far in any Unicode
11138         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11139         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11140         # the defaults; which are very unlikely to ever change.
11141         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11142         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11143
11144         # UAX44 says that if title is empty, it is the same as whatever upper
11145         # is,
11146         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11147
11148         # There are a few pairs of lines like:
11149         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11150         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11151         # that define ranges.  These should be processed after the fields are
11152         # adjusted above, as they may override some of them; but mostly what
11153         # is left is to possibly adjust the $CHARNAME field.  The names of all the
11154         # paired lines start with a '<', but this is also true of '<control>,
11155         # which isn't one of these special ones.
11156         if ($fields[$CHARNAME] eq '<control>') {
11157
11158             # Some code points in this file have the pseudo-name
11159             # '<control>', but the official name for such ones is the null
11160             # string.
11161             $fields[$NAME] = $fields[$CHARNAME] = "";
11162
11163             # We had better not be in between range lines.
11164             if ($in_range) {
11165                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11166                 $in_range = 0;
11167             }
11168         }
11169         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11170
11171             # Here is a non-range line.  We had better not be in between range
11172             # lines.
11173             if ($in_range) {
11174                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11175                 $in_range = 0;
11176             }
11177             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11178
11179                 # These are code points whose names end in their code points,
11180                 # which means the names are algorithmically derivable from the
11181                 # code points.  To shorten the output Name file, the algorithm
11182                 # for deriving these is placed in the file instead of each
11183                 # code point, so they have map type $CP_IN_NAME
11184                 $fields[$CHARNAME] = $CMD_DELIM
11185                                  . $MAP_TYPE_CMD
11186                                  . '='
11187                                  . $CP_IN_NAME
11188                                  . $CMD_DELIM
11189                                  . $fields[$CHARNAME];
11190             }
11191             $fields[$NAME] = $fields[$CHARNAME];
11192         }
11193         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11194             $fields[$CHARNAME] = $fields[$NAME] = $1;
11195
11196             # Here we are at the beginning of a range pair.
11197             if ($in_range) {
11198                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11199             }
11200             $in_range = 1;
11201
11202             # Because the properties in the range do not overwrite any already
11203             # in the db, we must flush the buffers of what's already there, so
11204             # they get handled in the normal scheme.
11205             $force_output = 1;
11206
11207         }
11208         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11209             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11210             $_ = "";
11211             return;
11212         }
11213         else { # Here, we are at the last line of a range pair.
11214
11215             if (! $in_range) {
11216                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11217                 $_ = "";
11218                 return;
11219             }
11220             $in_range = 0;
11221
11222             $fields[$NAME] = $fields[$CHARNAME];
11223
11224             # Check that the input is valid: that the closing of the range is
11225             # the same as the beginning.
11226             foreach my $i (0 .. $last_field) {
11227                 next if $fields[$i] eq $previous_fields[$i];
11228                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11229             }
11230
11231             # The processing differs depending on the type of range,
11232             # determined by its $CHARNAME
11233             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11234
11235                 # Check that the data looks right.
11236                 if ($decimal_previous_cp != $SBase) {
11237                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11238                 }
11239                 if ($decimal_cp != $SBase + $SCount - 1) {
11240                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11241                 }
11242
11243                 # The Hangul syllable range has a somewhat complicated name
11244                 # generation algorithm.  Each code point in it has a canonical
11245                 # decomposition also computable by an algorithm.  The
11246                 # perl decomposition map table built from these is used only
11247                 # by normalize.pm, which has the algorithm built in it, so the
11248                 # decomposition maps are not needed, and are large, so are
11249                 # omitted from it.  If the full decomposition map table is to
11250                 # be output, the decompositions are generated for it, in the
11251                 # EOF handling code for this input file.
11252
11253                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
11254
11255                 # This range is stored in our internal structure with its
11256                 # own map type, different from all others.
11257                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11258                                         = $CMD_DELIM
11259                                           . $MAP_TYPE_CMD
11260                                           . '='
11261                                           . $HANGUL_SYLLABLE
11262                                           . $CMD_DELIM
11263                                           . $fields[$CHARNAME];
11264             }
11265             elsif ($fields[$CHARNAME] =~ /^CJK/) {
11266
11267                 # The name for these contains the code point itself, and all
11268                 # are defined to have the same base name, regardless of what
11269                 # is in the file.  They are stored in our internal structure
11270                 # with a map type of $CP_IN_NAME
11271                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11272                                         = $CMD_DELIM
11273                                            . $MAP_TYPE_CMD
11274                                            . '='
11275                                            . $CP_IN_NAME
11276                                            . $CMD_DELIM
11277                                            . 'CJK UNIFIED IDEOGRAPH';
11278
11279             }
11280             elsif ($fields[$CATEGORY] eq 'Co'
11281                      || $fields[$CATEGORY] eq 'Cs')
11282             {
11283                 # The names of all the code points in these ranges are set to
11284                 # null, as there are no names for the private use and
11285                 # surrogate code points.
11286
11287                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11288             }
11289             else {
11290                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
11291             }
11292
11293             # The first line of the range caused everything else to be output,
11294             # and then its values were stored as the beginning values for the
11295             # next set of ranges, which this one ends.  Now, for each value,
11296             # add a command to tell the handler that these values should not
11297             # replace any existing ones in our database.
11298             foreach my $i (0 .. $last_field) {
11299                 $previous_fields[$i] = $CMD_DELIM
11300                                         . $REPLACE_CMD
11301                                         . '='
11302                                         . $NO
11303                                         . $CMD_DELIM
11304                                         . $previous_fields[$i];
11305             }
11306
11307             # And change things so it looks like the entire range has been
11308             # gone through with this being the final part of it.  Adding the
11309             # command above to each field will cause this range to be flushed
11310             # during the next iteration, as it guaranteed that the stored
11311             # field won't match whatever value the next one has.
11312             $previous_cp = $cp;
11313             $decimal_previous_cp = $decimal_cp;
11314
11315             # We are now set up for the next iteration; so skip the remaining
11316             # code in this subroutine that does the same thing, but doesn't
11317             # know about these ranges.
11318             $_ = "";
11319
11320             return;
11321         }
11322
11323         # On the very first line, we fake it so the code below thinks there is
11324         # nothing to output, and initialize so that when it does get output it
11325         # uses the first line's values for the lowest part of the range.
11326         # (One could avoid this by using peek(), but then one would need to
11327         # know the adjustments done above and do the same ones in the setup
11328         # routine; not worth it)
11329         if ($first_time) {
11330             $first_time = 0;
11331             @previous_fields = @fields;
11332             @start = ($cp) x scalar @fields;
11333             $decimal_previous_cp = $decimal_cp - 1;
11334         }
11335
11336         # For each field, output the stored up ranges that this code point
11337         # doesn't fit in.  Earlier we figured out if all ranges should be
11338         # terminated because of changing the replace or map type styles, or if
11339         # there is a gap between this new code point and the previous one, and
11340         # that is stored in $force_output.  But even if those aren't true, we
11341         # need to output the range if this new code point's value for the
11342         # given property doesn't match the stored range's.
11343         #local $to_trace = 1 if main::DEBUG;
11344         foreach my $i (0 .. $last_field) {
11345             my $field = $fields[$i];
11346             if ($force_output || $field ne $previous_fields[$i]) {
11347
11348                 # Flush the buffer of stored values.
11349                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11350
11351                 # Start a new range with this code point and its value
11352                 $start[$i] = $cp;
11353                 $previous_fields[$i] = $field;
11354             }
11355         }
11356
11357         # Set the values for the next time.
11358         $previous_cp = $cp;
11359         $decimal_previous_cp = $decimal_cp;
11360
11361         # The input line has generated whatever adjusted lines are needed, and
11362         # should not be looked at further.
11363         $_ = "";
11364         return;
11365     }
11366
11367     sub EOF_UnicodeData {
11368         # Called upon EOF to flush the buffers, and create the Hangul
11369         # decomposition mappings if needed.
11370
11371         my $file = shift;
11372         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11373
11374         # Flush the buffers.
11375         foreach my $i (0 .. $last_field) {
11376             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11377         }
11378
11379         if (-e 'Jamo.txt') {
11380
11381             # The algorithm is published by Unicode, based on values in
11382             # Jamo.txt, (which should have been processed before this
11383             # subroutine), and the results left in %Jamo
11384             unless (%Jamo) {
11385                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
11386                 return;
11387             }
11388
11389             # If the full decomposition map table is being output, insert
11390             # into it the Hangul syllable mappings.  This is to avoid having
11391             # to publish a subroutine in it to compute them.  (which would
11392             # essentially be this code.)  This uses the algorithm published by
11393             # Unicode.  (No hangul syllables in version 1)
11394             if ($v_version ge v2.0.0
11395                 && property_ref('Decomposition_Mapping')->to_output_map) {
11396                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11397                     use integer;
11398                     my $SIndex = $S - $SBase;
11399                     my $L = $LBase + $SIndex / $NCount;
11400                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
11401                     my $T = $TBase + $SIndex % $TCount;
11402
11403                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11404                     my $decomposition = sprintf("%04X %04X", $L, $V);
11405                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11406                     $file->insert_adjusted_lines(
11407                                 sprintf("%04X; Decomposition_Mapping; %s",
11408                                         $S,
11409                                         $decomposition));
11410                 }
11411             }
11412         }
11413
11414         return;
11415     }
11416
11417     sub filter_v1_ucd {
11418         # Fix UCD lines in version 1.  This is probably overkill, but this
11419         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
11420         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
11421         #       removed.  This program retains them
11422         # 2)    didn't include ranges, which it should have, and which are now
11423         #       added in @corrected_lines below.  It was hand populated by
11424         #       taking the data from Version 2, verified by analyzing
11425         #       DAge.txt.
11426         # 3)    There is a syntax error in the entry for U+09F8 which could
11427         #       cause problems for utf8_heavy, and so is changed.  It's
11428         #       numeric value was simply a minus sign, without any number.
11429         #       (Eventually Unicode changed the code point to non-numeric.)
11430         # 4)    The decomposition types often don't match later versions
11431         #       exactly, and the whole syntax of that field is different; so
11432         #       the syntax is changed as well as the types to their later
11433         #       terminology.  Otherwise normalize.pm would be very unhappy
11434         # 5)    Many ccc classes are different.  These are left intact.
11435         # 6)    U+FF10..U+FF19 are missing their numeric values in all three
11436         #       fields.  These are unchanged because it doesn't really cause
11437         #       problems for Perl.
11438         # 7)    A number of code points, such as controls, don't have their
11439         #       Unicode Version 1 Names in this file.  These are added.
11440         # 8)    A number of Symbols were marked as Lm.  This changes those in
11441         #       the Latin1 range, so that regexes work.
11442         # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
11443         #       referred to by their lc equivalents.  Not fixed.
11444
11445         my @corrected_lines = split /\n/, <<'END';
11446 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11447 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11448 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11449 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11450 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11451 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11452 END
11453
11454         my $file = shift;
11455         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11456
11457         #local $to_trace = 1 if main::DEBUG;
11458         trace $_ if main::DEBUG && $to_trace;
11459
11460         # -1 => retain trailing null fields
11461         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11462
11463         # At the first place that is wrong in the input, insert all the
11464         # corrections, replacing the wrong line.
11465         if ($code_point eq '4E00') {
11466             my @copy = @corrected_lines;
11467             $_ = shift @copy;
11468             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11469
11470             $file->insert_lines(@copy);
11471         }
11472         elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11473
11474             # There are no Lm characters in Latin1; these should be 'Sk', but
11475             # there isn't that in V1.
11476             $fields[$CATEGORY] = 'So';
11477         }
11478
11479         if ($fields[$NUMERIC] eq '-') {
11480             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
11481         }
11482
11483         if  ($fields[$PERL_DECOMPOSITION] ne "") {
11484
11485             # Several entries have this change to superscript 2 or 3 in the
11486             # middle.  Convert these to the modern version, which is to use
11487             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11488             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11489             # 'HHHH HHHH 00B3 HHHH'.
11490             # It turns out that all of these that don't have another
11491             # decomposition defined at the beginning of the line have the
11492             # <square> decomposition in later releases.
11493             if ($code_point ne '00B2' && $code_point ne '00B3') {
11494                 if  ($fields[$PERL_DECOMPOSITION]
11495                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11496                 {
11497                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11498                         $fields[$PERL_DECOMPOSITION] = '<square> '
11499                         . $fields[$PERL_DECOMPOSITION];
11500                     }
11501                 }
11502             }
11503
11504             # If is like '<+circled> 0052 <-circled>', convert to
11505             # '<circled> 0052'
11506             $fields[$PERL_DECOMPOSITION] =~
11507                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11508
11509             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11510             $fields[$PERL_DECOMPOSITION] =~
11511                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11512             or $fields[$PERL_DECOMPOSITION] =~
11513                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11514             or $fields[$PERL_DECOMPOSITION] =~
11515                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11516             or $fields[$PERL_DECOMPOSITION] =~
11517                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11518
11519             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11520             $fields[$PERL_DECOMPOSITION] =~
11521                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11522
11523             # Change names to modern form.
11524             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11525             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11526             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11527             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11528
11529             # One entry has weird braces
11530             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11531
11532             # One entry at U+2116 has an extra <sup>
11533             $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
11534         }
11535
11536         $_ = join ';', $code_point, @fields;
11537         trace $_ if main::DEBUG && $to_trace;
11538         return;
11539     }
11540
11541     sub filter_bad_Nd_ucd {
11542         # Early versions specified a value in the decimal digit field even
11543         # though the code point wasn't a decimal digit.  Clear the field in
11544         # that situation, so that the main code doesn't think it is a decimal
11545         # digit.
11546
11547         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11548         if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11549             $fields[$PERL_DECIMAL_DIGIT] = "";
11550             $_ = join ';', $code_point, @fields;
11551         }
11552         return;
11553     }
11554
11555     my @U1_control_names = split /\n/, <<'END';
11556 NULL
11557 START OF HEADING
11558 START OF TEXT
11559 END OF TEXT
11560 END OF TRANSMISSION
11561 ENQUIRY
11562 ACKNOWLEDGE
11563 BELL
11564 BACKSPACE
11565 HORIZONTAL TABULATION
11566 LINE FEED
11567 VERTICAL TABULATION
11568 FORM FEED
11569 CARRIAGE RETURN
11570 SHIFT OUT
11571 SHIFT IN
11572 DATA LINK ESCAPE
11573 DEVICE CONTROL ONE
11574 DEVICE CONTROL TWO
11575 DEVICE CONTROL THREE
11576 DEVICE CONTROL FOUR
11577 NEGATIVE ACKNOWLEDGE
11578 SYNCHRONOUS IDLE
11579 END OF TRANSMISSION BLOCK
11580 CANCEL
11581 END OF MEDIUM
11582 SUBSTITUTE
11583 ESCAPE
11584 FILE SEPARATOR
11585 GROUP SEPARATOR
11586 RECORD SEPARATOR
11587 UNIT SEPARATOR
11588 DELETE
11589 BREAK PERMITTED HERE
11590 NO BREAK HERE
11591 INDEX
11592 NEXT LINE
11593 START OF SELECTED AREA
11594 END OF SELECTED AREA
11595 CHARACTER TABULATION SET
11596 CHARACTER TABULATION WITH JUSTIFICATION
11597 LINE TABULATION SET
11598 PARTIAL LINE DOWN
11599 PARTIAL LINE UP
11600 REVERSE LINE FEED
11601 SINGLE SHIFT TWO
11602 SINGLE SHIFT THREE
11603 DEVICE CONTROL STRING
11604 PRIVATE USE ONE
11605 PRIVATE USE TWO
11606 SET TRANSMIT STATE
11607 CANCEL CHARACTER
11608 MESSAGE WAITING
11609 START OF GUARDED AREA
11610 END OF GUARDED AREA
11611 START OF STRING
11612 SINGLE CHARACTER INTRODUCER
11613 CONTROL SEQUENCE INTRODUCER
11614 STRING TERMINATOR
11615 OPERATING SYSTEM COMMAND
11616 PRIVACY MESSAGE
11617 APPLICATION PROGRAM COMMAND
11618 END
11619
11620     sub filter_early_U1_names {
11621         # Very early versions did not have the Unicode_1_name field specified.
11622         # They differed in which ones were present; make sure a U1 name
11623         # exists, so that Unicode::UCD::charinfo will work
11624
11625         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11626
11627
11628         # @U1_control names above are entirely positional, so we pull them out
11629         # in the exact order required, with gaps for the ones that don't have
11630         # names.
11631         if ($code_point =~ /^00[01]/
11632             || $code_point eq '007F'
11633             || $code_point =~ /^008[2-9A-F]/
11634             || $code_point =~ /^009[0-8A-F]/)
11635         {
11636             my $u1_name = shift @U1_control_names;
11637             $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11638             $_ = join ';', $code_point, @fields;
11639         }
11640         return;
11641     }
11642
11643     sub filter_v2_1_5_ucd {
11644         # A dozen entries in this 2.1.5 file had the mirrored and numeric
11645         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
11646         # column appears to be N, swap it back.
11647
11648         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11649         if ($fields[$NUMERIC] eq 'N') {
11650             $fields[$NUMERIC] = $fields[$MIRRORED];
11651             $fields[$MIRRORED] = 'N';
11652             $_ = join ';', $code_point, @fields;
11653         }
11654         return;
11655     }
11656
11657     sub filter_v6_ucd {
11658
11659         # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11660         # it wasn't accepted, to allow for some deprecation cycles.  This
11661         # function is not called after 5.16
11662
11663         return if $_ !~ /^(?:0007|1F514|070F);/;
11664
11665         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11666         if ($code_point eq '0007') {
11667             $fields[$CHARNAME] = "";
11668         }
11669         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11670                             # http://www.unicode.org/versions/corrigendum8.html
11671             $fields[$BIDI] = "AL";
11672         }
11673         elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11674             $fields[$CHARNAME] = "";
11675         }
11676
11677         $_ = join ';', $code_point, @fields;
11678
11679         return;
11680     }
11681 } # End closure for UnicodeData
11682
11683 sub process_GCB_test {
11684
11685     my $file = shift;
11686     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11687
11688     while ($file->next_line) {
11689         push @backslash_X_tests, $_;
11690     }
11691
11692     return;
11693 }
11694
11695 sub process_NamedSequences {
11696     # NamedSequences.txt entries are just added to an array.  Because these
11697     # don't look like the other tables, they have their own handler.
11698     # An example:
11699     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11700     #
11701     # This just adds the sequence to an array for later handling
11702
11703     my $file = shift;
11704     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11705
11706     while ($file->next_line) {
11707         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11708         if (@remainder) {
11709             $file->carp_bad_line(
11710                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11711             next;
11712         }
11713
11714         # Note single \t in keeping with special output format of
11715         # Perl_charnames.  But it turns out that the code points don't have to
11716         # be 5 digits long, like the rest, based on the internal workings of
11717         # charnames.pm.  This could be easily changed for consistency.
11718         push @named_sequences, "$sequence\t$name";
11719     }
11720     return;
11721 }
11722
11723 { # Closure
11724
11725     my $first_range;
11726
11727     sub  filter_early_ea_lb {
11728         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
11729         # third field be the name of the code point, which can be ignored in
11730         # most cases.  But it can be meaningful if it marks a range:
11731         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11732         # 3400;W;<CJK Ideograph Extension A, First>
11733         #
11734         # We need to see the First in the example above to know it's a range.
11735         # They did not use the later range syntaxes.  This routine changes it
11736         # to use the modern syntax.
11737         # $1 is the Input_file object.
11738
11739         my @fields = split /\s*;\s*/;
11740         if ($fields[2] =~ /^<.*, First>/) {
11741             $first_range = $fields[0];
11742             $_ = "";
11743         }
11744         elsif ($fields[2] =~ /^<.*, Last>/) {
11745             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11746         }
11747         else {
11748             undef $first_range;
11749             $_ = "$fields[0]; $fields[1]";
11750         }
11751
11752         return;
11753     }
11754 }
11755
11756 sub filter_old_style_arabic_shaping {
11757     # Early versions used a different term for the later one.
11758
11759     my @fields = split /\s*;\s*/;
11760     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11761     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
11762     $_ = join ';', @fields;
11763     return;
11764 }
11765
11766 { # Closure
11767     my $lc; # Table for lowercase mapping
11768     my $tc;
11769     my $uc;
11770     my %special_casing_code_points;
11771
11772     sub setup_special_casing {
11773         # SpecialCasing.txt contains the non-simple case change mappings.  The
11774         # simple ones are in UnicodeData.txt, which should already have been
11775         # read in to the full property data structures, so as to initialize
11776         # these with the simple ones.  Then the SpecialCasing.txt entries
11777         # add or overwrite the ones which have different full mappings.
11778
11779         # This routine sees if the simple mappings are to be output, and if
11780         # so, copies what has already been put into the full mapping tables,
11781         # while they still contain only the simple mappings.
11782
11783         # The reason it is done this way is that the simple mappings are
11784         # probably not going to be output, so it saves work to initialize the
11785         # full tables with the simple mappings, and then overwrite those
11786         # relatively few entries in them that have different full mappings,
11787         # and thus skip the simple mapping tables altogether.
11788
11789         my $file= shift;
11790         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11791
11792         $lc = property_ref('lc');
11793         $tc = property_ref('tc');
11794         $uc = property_ref('uc');
11795
11796         # For each of the case change mappings...
11797         foreach my $full_casing_table ($lc, $tc, $uc) {
11798             my $full_casing_name = $full_casing_table->name;
11799             my $full_casing_full_name = $full_casing_table->full_name;
11800             unless (defined $full_casing_table
11801                     && ! $full_casing_table->is_empty)
11802             {
11803                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
11804             }
11805
11806             # Create a table in the old-style format and with the original
11807             # file name for backwards compatibility with applications that
11808             # read it directly.  The new tables contain both the simple and
11809             # full maps, and the old are missing simple maps when there is a
11810             # conflicting full one.  Probably it would have been ok to add
11811             # those to the legacy version, as was already done in 5.14 to the
11812             # case folding one, but this was not done, out of an abundance of
11813             # caution.  The tables are set up here before we deal with the
11814             # full maps so that as we handle those, we can override the simple
11815             # maps for them in the legacy table, and merely add them in the
11816             # new-style one.
11817             my $legacy = Property->new("Legacy_" . $full_casing_full_name,
11818                                 File => $full_casing_full_name
11819                                                           =~ s/case_Mapping//r,
11820                                 Format => $HEX_FORMAT,
11821                                 Default_Map => $CODE_POINT,
11822                                 Initialize => $full_casing_table,
11823                                 Replacement_Property => $full_casing_full_name,
11824             );
11825
11826             $full_casing_table->add_comment(join_lines( <<END
11827 This file includes both the simple and full case changing maps.  The simple
11828 ones are in the main body of the table below, and the full ones adding to or
11829 overriding them are in the hash.
11830 END
11831             ));
11832
11833             # The simple version's name in each mapping merely has an 's' in
11834             # front of the full one's
11835             my $simple_name = 's' . $full_casing_name;
11836             my $simple = property_ref($simple_name);
11837             $simple->initialize($full_casing_table) if $simple->to_output_map();
11838         }
11839
11840         return;
11841     }
11842
11843     sub filter_2_1_8_special_casing_line {
11844
11845         # This version had duplicate entries in this file.  Delete all but the
11846         # first one
11847         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11848                                               # fields
11849         if (exists $special_casing_code_points{$fields[0]}) {
11850             $_ = "";
11851             return;
11852         }
11853
11854         $special_casing_code_points{$fields[0]} = 1;
11855         filter_special_casing_line(@_);
11856     }
11857
11858     sub filter_special_casing_line {
11859         # Change the format of $_ from SpecialCasing.txt into something that
11860         # the generic handler understands.  Each input line contains three
11861         # case mappings.  This will generate three lines to pass to the
11862         # generic handler for each of those.
11863
11864         # The input syntax (after stripping comments and trailing white space
11865         # is like one of the following (with the final two being entries that
11866         # we ignore):
11867         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11868         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11869         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11870         # Note the trailing semi-colon, unlike many of the input files.  That
11871         # means that there will be an extra null field generated by the split
11872
11873         my $file = shift;
11874         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11875
11876         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11877                                               # fields
11878
11879         # field #4 is when this mapping is conditional.  If any of these get
11880         # implemented, it would be by hard-coding in the casing functions in
11881         # the Perl core, not through tables.  But if there is a new condition
11882         # we don't know about, output a warning.  We know about all the
11883         # conditions through 6.0
11884         if ($fields[4] ne "") {
11885             my @conditions = split ' ', $fields[4];
11886             if ($conditions[0] ne 'tr'  # We know that these languages have
11887                                         # conditions, and some are multiple
11888                 && $conditions[0] ne 'az'
11889                 && $conditions[0] ne 'lt'
11890
11891                 # And, we know about a single condition Final_Sigma, but
11892                 # nothing else.
11893                 && ($v_version gt v5.2.0
11894                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11895             {
11896                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11897             }
11898             elsif ($conditions[0] ne 'Final_Sigma') {
11899
11900                     # Don't print out a message for Final_Sigma, because we
11901                     # have hard-coded handling for it.  (But the standard
11902                     # could change what the rule should be, but it wouldn't
11903                     # show up here anyway.
11904
11905                     print "# SKIPPING Special Casing: $_\n"
11906                                                     if $verbosity >= $VERBOSE;
11907             }
11908             $_ = "";
11909             return;
11910         }
11911         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11912             $file->carp_bad_line('Extra fields');
11913             $_ = "";
11914             return;
11915         }
11916
11917         my $decimal_code_point = hex $fields[0];
11918
11919         # Loop to handle each of the three mappings in the input line, in
11920         # order, with $i indicating the current field number.
11921         my $i = 0;
11922         for my $object ($lc, $tc, $uc) {
11923             $i++;   # First time through, $i = 0 ... 3rd time = 3
11924
11925             my $value = $object->value_of($decimal_code_point);
11926             $value = ($value eq $CODE_POINT)
11927                       ? $decimal_code_point
11928                       : hex $value;
11929
11930             # If this isn't a multi-character mapping, it should already have
11931             # been read in.
11932             if ($fields[$i] !~ / /) {
11933                 if ($value != hex $fields[$i]) {
11934                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
11935                                   . $object->name
11936                                   . "(0x$fields[0]) is $value"
11937                                   . " and SpecialCasing.txt thinks it is "
11938                                   . hex($fields[$i])
11939                                   . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
11940                 }
11941             }
11942             else {
11943
11944                 # The mapping goes into both the legacy table, in which it
11945                 # replaces the simple one...
11946                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11947                                              . $object->full_name
11948                                              . "; $fields[$i]");
11949
11950                 # ... and the regular table, in which it is additional,
11951                 # beyond the simple mapping.
11952                 $file->insert_adjusted_lines("$fields[0]; "
11953                                              . $object->name
11954                                             . "; "
11955                                             . $CMD_DELIM
11956                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11957                                             . $CMD_DELIM
11958                                             . $fields[$i]);
11959             }
11960         }
11961
11962         # Everything has been handled by the insert_adjusted_lines()
11963         $_ = "";
11964
11965         return;
11966     }
11967 }
11968
11969 sub filter_old_style_case_folding {
11970     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11971     # and later style.  Different letters were used in the earlier.
11972
11973     my $file = shift;
11974     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11975
11976     my @fields = split /\s*;\s*/;
11977     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11978         $fields[1] = 'I';
11979     }
11980     elsif ($fields[1] eq 'L') {
11981         $fields[1] = 'C';             # L => C always
11982     }
11983     elsif ($fields[1] eq 'E') {
11984         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
11985             $fields[1] = 'F'
11986         }
11987         else {
11988             $fields[1] = 'C'
11989         }
11990     }
11991     else {
11992         $file->carp_bad_line("Expecting L or E in second field");
11993         $_ = "";
11994         return;
11995     }
11996     $_ = join("; ", @fields) . ';';
11997     return;
11998 }
11999
12000 { # Closure for case folding
12001
12002     # Create the map for simple only if are going to output it, for otherwise
12003     # it takes no part in anything we do.
12004     my $to_output_simple;
12005
12006     sub setup_case_folding($) {
12007         # Read in the case foldings in CaseFolding.txt.  This handles both
12008         # simple and full case folding.
12009
12010         $to_output_simple
12011                         = property_ref('Simple_Case_Folding')->to_output_map;
12012
12013         if (! $to_output_simple) {
12014             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12015         }
12016
12017         # If we ever wanted to show that these tables were combined, a new
12018         # property method could be created, like set_combined_props()
12019         property_ref('Case_Folding')->add_comment(join_lines( <<END
12020 This file includes both the simple and full case folding maps.  The simple
12021 ones are in the main body of the table below, and the full ones adding to or
12022 overriding them are in the hash.
12023 END
12024         ));
12025         return;
12026     }
12027
12028     sub filter_case_folding_line {
12029         # Called for each line in CaseFolding.txt
12030         # Input lines look like:
12031         # 0041; C; 0061; # LATIN CAPITAL LETTER A
12032         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12033         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12034         #
12035         # 'C' means that folding is the same for both simple and full
12036         # 'F' that it is only for full folding
12037         # 'S' that it is only for simple folding
12038         # 'T' is locale-dependent, and ignored
12039         # 'I' is a type of 'F' used in some early releases.
12040         # Note the trailing semi-colon, unlike many of the input files.  That
12041         # means that there will be an extra null field generated by the split
12042         # below, which we ignore and hence is not an error.
12043
12044         my $file = shift;
12045         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12046
12047         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12048         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12049             $file->carp_bad_line('Extra fields');
12050             $_ = "";
12051             return;
12052         }
12053
12054         if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12055             $_ = "";
12056             return;
12057         }
12058
12059         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12060         # I are all full foldings; S is single-char.  For S, there is always
12061         # an F entry, so we must allow multiple values for the same code
12062         # point.  Fortunately this table doesn't need further manipulation
12063         # which would preclude using multiple-values.  The S is now included
12064         # so that _swash_inversion_hash() is able to construct closures
12065         # without having to worry about F mappings.
12066         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12067             $_ = "$range; Case_Folding; "
12068                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12069         }
12070         else {
12071             $_ = "";
12072             $file->carp_bad_line('Expecting C F I S or T in second field');
12073         }
12074
12075         # C and S are simple foldings, but simple case folding is not needed
12076         # unless we explicitly want its map table output.
12077         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12078             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12079         }
12080
12081         return;
12082     }
12083
12084 } # End case fold closure
12085
12086 sub filter_jamo_line {
12087     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12088     # from this file that is used in generating the Name property for Jamo
12089     # code points.  But, it also is used to convert early versions' syntax
12090     # into the modern form.  Here are two examples:
12091     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12092     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12093     #
12094     # The input is $_, the output is $_ filtered.
12095
12096     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12097
12098     # Let the caller handle unexpected input.  In earlier versions, there was
12099     # a third field which is supposed to be a comment, but did not have a '#'
12100     # before it.
12101     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12102
12103     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12104                                 # beginning.
12105
12106     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12107     $fields[1] = 'R' if $fields[0] eq '1105';
12108
12109     # Add to structure so can generate Names from it.
12110     my $cp = hex $fields[0];
12111     my $short_name = $fields[1];
12112     $Jamo{$cp} = $short_name;
12113     if ($cp <= $LBase + $LCount) {
12114         $Jamo_L{$short_name} = $cp - $LBase;
12115     }
12116     elsif ($cp <= $VBase + $VCount) {
12117         $Jamo_V{$short_name} = $cp - $VBase;
12118     }
12119     elsif ($cp <= $TBase + $TCount) {
12120         $Jamo_T{$short_name} = $cp - $TBase;
12121     }
12122     else {
12123         Carp::my_carp_bug("Unexpected Jamo code point in $_");
12124     }
12125
12126
12127     # Reassemble using just the first two fields to look like a typical
12128     # property file line
12129     $_ = "$fields[0]; $fields[1]";
12130
12131     return;
12132 }
12133
12134 sub register_fraction($) {
12135     # This registers the input rational number so that it can be passed on to
12136     # utf8_heavy.pl, both in rational and floating forms.
12137
12138     my $rational = shift;
12139
12140     my $float = eval $rational;
12141     $nv_floating_to_rational{$float} = $rational;
12142     return;
12143 }
12144
12145 sub filter_numeric_value_line {
12146     # DNumValues contains lines of a different syntax than the typical
12147     # property file:
12148     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
12149     #
12150     # This routine transforms $_ containing the anomalous syntax to the
12151     # typical, by filtering out the extra columns, and convert early version
12152     # decimal numbers to strings that look like rational numbers.
12153
12154     my $file = shift;
12155     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12156
12157     # Starting in 5.1, there is a rational field.  Just use that, omitting the
12158     # extra columns.  Otherwise convert the decimal number in the second field
12159     # to a rational, and omit extraneous columns.
12160     my @fields = split /\s*;\s*/, $_, -1;
12161     my $rational;
12162
12163     if ($v_version ge v5.1.0) {
12164         if (@fields != 4) {
12165             $file->carp_bad_line('Not 4 semi-colon separated fields');
12166             $_ = "";
12167             return;
12168         }
12169         $rational = $fields[3];
12170         $_ = join '; ', @fields[ 0, 3 ];
12171     }
12172     else {
12173
12174         # Here, is an older Unicode file, which has decimal numbers instead of
12175         # rationals in it.  Use the fraction to calculate the denominator and
12176         # convert to rational.
12177
12178         if (@fields != 2 && @fields != 3) {
12179             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12180             $_ = "";
12181             return;
12182         }
12183
12184         my $codepoints = $fields[0];
12185         my $decimal = $fields[1];
12186         if ($decimal =~ s/\.0+$//) {
12187
12188             # Anything ending with a decimal followed by nothing but 0's is an
12189             # integer
12190             $_ = "$codepoints; $decimal";
12191             $rational = $decimal;
12192         }
12193         else {
12194
12195             my $denominator;
12196             if ($decimal =~ /\.50*$/) {
12197                 $denominator = 2;
12198             }
12199
12200             # Here have the hardcoded repeating decimals in the fraction, and
12201             # the denominator they imply.  There were only a few denominators
12202             # in the older Unicode versions of this file which this code
12203             # handles, so it is easy to convert them.
12204
12205             # The 4 is because of a round-off error in the Unicode 3.2 files
12206             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12207                 $denominator = 3;
12208             }
12209             elsif ($decimal =~ /\.[27]50*$/) {
12210                 $denominator = 4;
12211             }
12212             elsif ($decimal =~ /\.[2468]0*$/) {
12213                 $denominator = 5;
12214             }
12215             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12216                 $denominator = 6;
12217             }
12218             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12219                 $denominator = 8;
12220             }
12221             if ($denominator) {
12222                 my $sign = ($decimal < 0) ? "-" : "";
12223                 my $numerator = int((abs($decimal) * $denominator) + .5);
12224                 $rational = "$sign$numerator/$denominator";
12225                 $_ = "$codepoints; $rational";
12226             }
12227             else {
12228                 $file->carp_bad_line("Can't cope with number '$decimal'.");
12229                 $_ = "";
12230                 return;
12231             }
12232         }
12233     }
12234
12235     register_fraction($rational) if $rational =~ qr{/};
12236     return;
12237 }
12238
12239 { # Closure
12240     my %unihan_properties;
12241
12242     sub setup_unihan {
12243         # Do any special setup for Unihan properties.
12244
12245         # This property gives the wrong computed type, so override.
12246         my $usource = property_ref('kIRG_USource');
12247         $usource->set_type($STRING) if defined $usource;
12248
12249         # This property is to be considered binary (it says so in
12250         # http://www.unicode.org/reports/tr38/)
12251         my $iicore = property_ref('kIICore');
12252         if (defined $iicore) {
12253             $iicore->set_type($FORCED_BINARY);
12254             $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
12255
12256             # Unicode doesn't include the maps for this property, so don't
12257             # warn that they are missing.
12258             $iicore->set_pre_declared_maps(0);
12259             $iicore->add_comment(join_lines( <<END
12260 This property contains enum values, but Unicode UAX #38 says it should be
12261 interpreted as binary, so Perl creates tables for both 1) its enum values,
12262 plus 2) true/false tables in which it is considered true for all code points
12263 that have a non-null value
12264 END
12265             ));
12266         }
12267
12268         return;
12269     }
12270
12271     sub filter_unihan_line {
12272         # Change unihan db lines to look like the others in the db.  Here is
12273         # an input sample:
12274         #   U+341C        kCangjie        IEKN
12275
12276         # Tabs are used instead of semi-colons to separate fields; therefore
12277         # they may have semi-colons embedded in them.  Change these to periods
12278         # so won't screw up the rest of the code.
12279         s/;/./g;
12280
12281         # Remove lines that don't look like ones we accept.
12282         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12283             $_ = "";
12284             return;
12285         }
12286
12287         # Extract the property, and save a reference to its object.
12288         my $property = $1;
12289         if (! exists $unihan_properties{$property}) {
12290             $unihan_properties{$property} = property_ref($property);
12291         }
12292
12293         # Don't do anything unless the property is one we're handling, which
12294         # we determine by seeing if there is an object defined for it or not
12295         if (! defined $unihan_properties{$property}) {
12296             $_ = "";
12297             return;
12298         }
12299
12300         # Convert the tab separators to our standard semi-colons, and convert
12301         # the U+HHHH notation to the rest of the standard's HHHH
12302         s/\t/;/g;
12303         s/\b U \+ (?= $code_point_re )//xg;
12304
12305         #local $to_trace = 1 if main::DEBUG;
12306         trace $_ if main::DEBUG && $to_trace;
12307
12308         return;
12309     }
12310 }
12311
12312 sub filter_blocks_lines {
12313     # In the Blocks.txt file, the names of the blocks don't quite match the
12314     # names given in PropertyValueAliases.txt, so this changes them so they
12315     # do match:  Blanks and hyphens are changed into underscores.  Also makes
12316     # early release versions look like later ones
12317     #
12318     # $_ is transformed to the correct value.
12319
12320     my $file = shift;
12321         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12322
12323     if ($v_version lt v3.2.0) {
12324         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12325             $_ = "";
12326             return;
12327         }
12328
12329         # Old versions used a different syntax to mark the range.
12330         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12331     }
12332
12333     my @fields = split /\s*;\s*/, $_, -1;
12334     if (@fields != 2) {
12335         $file->carp_bad_line("Expecting exactly two fields");
12336         $_ = "";
12337         return;
12338     }
12339
12340     # Change hyphens and blanks in the block name field only
12341     $fields[1] =~ s/[ -]/_/g;
12342     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
12343
12344     $_ = join("; ", @fields);
12345     return;
12346 }
12347
12348 { # Closure
12349     my $current_property;
12350
12351     sub filter_old_style_proplist {
12352         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
12353         # was in a completely different syntax.  Ken Whistler of Unicode says
12354         # that it was something he used as an aid for his own purposes, but
12355         # was never an official part of the standard.  Many of the properties
12356         # in it were incorporated into the later PropList.txt, but some were
12357         # not.  This program uses this early file to generate property tables
12358         # that are otherwise not accessible in the early UCD's.  It does this
12359         # for the ones that eventually became official, and don't appear to be
12360         # too different in their contents from the later official version, and
12361         # throws away the rest.  It could be argued that the ones it generates
12362         # were probably not really official at that time, so should be
12363         # ignored.  You can easily modify things to skip all of them by
12364         # changing this function to just set $_ to "", and return; and to skip
12365         # certain of them by by simply removing their declarations from
12366         # get_old_property_aliases().
12367         #
12368         # Here is a list of all the ones that are thrown away:
12369         #   Alphabetic                   The definitions for this are very
12370         #                                defective, so better to not mislead
12371         #                                people into thinking it works.
12372         #                                Instead the Perl extension of the
12373         #                                same name is constructed from first
12374         #                                principles.
12375         #   Bidi=*                       duplicates UnicodeData.txt
12376         #   Combining                    never made into official property;
12377         #                                is \P{ccc=0}
12378         #   Composite                    never made into official property.
12379         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
12380         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
12381         #   Delimiter                    never made into official property;
12382         #                                removed in 3.0.1
12383         #   Format Control               never made into official property;
12384         #                                similar to gc=cf
12385         #   High Surrogate               duplicates Blocks.txt
12386         #   Ignorable Control            never made into official property;
12387         #                                similar to di=y
12388         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
12389         #   Left of Pair                 never made into official property;
12390         #   Line Separator               duplicates UnicodeData.txt: gc=zl
12391         #   Low Surrogate                duplicates Blocks.txt
12392         #   Non-break                    was actually listed as a property
12393         #                                in 3.2, but without any code
12394         #                                points.  Unicode denies that this
12395         #                                was ever an official property
12396         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
12397         #   Numeric                      duplicates UnicodeData.txt: gc=cc
12398         #   Paired Punctuation           never made into official property;
12399         #                                appears to be gc=ps + gc=pe
12400         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
12401         #   Private Use                  duplicates UnicodeData.txt: gc=co
12402         #   Private Use High Surrogate   duplicates Blocks.txt
12403         #   Punctuation                  duplicates UnicodeData.txt: gc=p
12404         #   Space                        different definition than eventual
12405         #                                one.
12406         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
12407         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
12408         #   Zero-width                   never made into official property;
12409         #                                subset of gc=cf
12410         # Most of the properties have the same names in this file as in later
12411         # versions, but a couple do not.
12412         #
12413         # This subroutine filters $_, converting it from the old style into
12414         # the new style.  Here's a sample of the old-style
12415         #
12416         #   *******************************************
12417         #
12418         #   Property dump for: 0x100000A0 (Join Control)
12419         #
12420         #   200C..200D  (2 chars)
12421         #
12422         # In the example, the property is "Join Control".  It is kept in this
12423         # closure between calls to the subroutine.  The numbers beginning with
12424         # 0x were internal to Ken's program that generated this file.
12425
12426         # If this line contains the property name, extract it.
12427         if (/^Property dump for: [^(]*\((.*)\)/) {
12428             $_ = $1;
12429
12430             # Convert white space to underscores.
12431             s/ /_/g;
12432
12433             # Convert the few properties that don't have the same name as
12434             # their modern counterparts
12435             s/Identifier_Part/ID_Continue/
12436             or s/Not_a_Character/NChar/;
12437
12438             # If the name matches an existing property, use it.
12439             if (defined property_ref($_)) {
12440                 trace "new property=", $_ if main::DEBUG && $to_trace;
12441                 $current_property = $_;
12442             }
12443             else {        # Otherwise discard it
12444                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
12445                 undef $current_property;
12446             }
12447             $_ = "";    # The property is saved for the next lines of the
12448                         # file, but this defining line is of no further use,
12449                         # so clear it so that the caller won't process it
12450                         # further.
12451         }
12452         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
12453
12454             # Here, the input line isn't a header defining a property for the
12455             # following section, and either we aren't in such a section, or
12456             # the line doesn't look like one that defines the code points in
12457             # such a section.  Ignore this line.
12458             $_ = "";
12459         }
12460         else {
12461
12462             # Here, we have a line defining the code points for the current
12463             # stashed property.  Anything starting with the first blank is
12464             # extraneous.  Otherwise, it should look like a normal range to
12465             # the caller.  Append the property name so that it looks just like
12466             # a modern PropList entry.
12467
12468             $_ =~ s/\s.*//;
12469             $_ .= "; $current_property";
12470         }
12471         trace $_ if main::DEBUG && $to_trace;
12472         return;
12473     }
12474 } # End closure for old style proplist
12475
12476 sub filter_old_style_normalization_lines {
12477     # For early releases of Unicode, the lines were like:
12478     #        74..2A76    ; NFKD_NO
12479     # For later releases this became:
12480     #        74..2A76    ; NFKD_QC; N
12481     # Filter $_ to look like those in later releases.
12482     # Similarly for MAYBEs
12483
12484     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
12485
12486     # Also, the property FC_NFKC was abbreviated to FNC
12487     s/FNC/FC_NFKC/;
12488     return;
12489 }
12490
12491 sub setup_script_extensions {
12492     # The Script_Extensions property starts out with a clone of the Script
12493     # property.
12494
12495     my $scx = property_ref("Script_Extensions");
12496     $scx = Property->new("scx", Full_Name => "Script_Extensions")
12497                                                             if ! defined $scx;
12498     $scx->_set_format($STRING_WHITE_SPACE_LIST);
12499     $scx->initialize($script);
12500     $scx->set_default_map($script->default_map);
12501     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
12502     $scx->add_comment(join_lines( <<END
12503 The values for code points that appear in one script are just the same as for
12504 the 'Script' property.  Likewise the values for those that appear in many
12505 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
12506 values of code points that appear in a few scripts are a space separated list
12507 of those scripts.
12508 END
12509     ));
12510
12511     # Initialize scx's tables and the aliases for them to be the same as sc's
12512     foreach my $table ($script->tables) {
12513         my $scx_table = $scx->add_match_table($table->name,
12514                                 Full_Name => $table->full_name);
12515         foreach my $alias ($table->aliases) {
12516             $scx_table->add_alias($alias->name);
12517         }
12518     }
12519 }
12520
12521 sub  filter_script_extensions_line {
12522     # The Scripts file comes with the full name for the scripts; the
12523     # ScriptExtensions, with the short name.  The final mapping file is a
12524     # combination of these, and without adjustment, would have inconsistent
12525     # entries.  This filters the latter file to convert to full names.
12526     # Entries look like this:
12527     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12528
12529     my @fields = split /\s*;\s*/;
12530
12531     # This script was erroneously omitted in this Unicode version.
12532     $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12533
12534     my @full_names;
12535     foreach my $short_name (split " ", $fields[1]) {
12536         push @full_names, $script->table($short_name)->full_name;
12537     }
12538     $fields[1] = join " ", @full_names;
12539     $_ = join "; ", @fields;
12540
12541     return;
12542 }
12543
12544 sub generate_hst {
12545
12546     # Populates the Hangul Syllable Type property from first principles
12547
12548     my $file= shift;
12549     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12550
12551     # These few ranges are hard-coded in.
12552     $file->insert_lines(split /\n/, <<'END'
12553 1100..1159    ; L
12554 115F          ; L
12555 1160..11A2    ; V
12556 11A8..11F9    ; T
12557 END
12558 );
12559
12560     # The Hangul syllables in version 1 are completely different than what came
12561     # after, so just ignore them there.
12562     if ($v_version lt v2.0.0) {
12563         my $property = property_ref($file->property);
12564         push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12565         push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12566         return;
12567     }
12568
12569     # The algorithmically derived syllables are almost all LVT ones, so
12570     # initialize the whole range with that.
12571     $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12572                         $SBase, $SBase + $SCount -1);
12573
12574     # Those ones that aren't LVT are LV, and they occur at intervals of
12575     # $TCount code points, starting with the first code point, at $SBase.
12576     for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12577         $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12578     }
12579
12580     return;
12581 }
12582
12583 sub generate_GCB {
12584
12585     # Populates the Grapheme Cluster Break property from first principles
12586
12587     my $file= shift;
12588     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12589
12590     # All these definitions are from
12591     # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12592     # from http://www.unicode.org/reports/tr29/tr29-4.html
12593
12594     foreach my $range ($gc->ranges) {
12595
12596         # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12597         # and gc=Cf
12598         if ($range->value =~ / ^ M [en] $ /x) {
12599             $file->insert_lines(sprintf "%04X..%04X; Extend",
12600                                 $range->start,  $range->end);
12601         }
12602         elsif ($range->value =~ / ^ C [cf] $ /x) {
12603             $file->insert_lines(sprintf "%04X..%04X; Control",
12604                                 $range->start,  $range->end);
12605         }
12606     }
12607     $file->insert_lines("2028; Control"); # Line Separator
12608     $file->insert_lines("2029; Control"); # Paragraph Separator
12609
12610     $file->insert_lines("000D; CR");
12611     $file->insert_lines("000A; LF");
12612
12613     # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12614     foreach my $code_point ( qw{
12615                                 40000
12616                                 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12617                                 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12618                                 }
12619     ) {
12620         my $category = $gc->value_of(hex $code_point);
12621         next if ! defined $category || $category eq 'Cn'; # But not if
12622                                                           # unassigned in this
12623                                                           # release
12624         $file->insert_lines("$code_point; Extend");
12625     }
12626
12627     my $hst = property_ref('Hangul_Syllable_Type');
12628     if ($hst->count > 0) {
12629         foreach my $range ($hst->ranges) {
12630             $file->insert_lines(sprintf "%04X..%04X; %s",
12631                                     $range->start, $range->end, $range->value);
12632         }
12633     }
12634     else {
12635         generate_hst($file);
12636     }
12637
12638     return;
12639 }
12640
12641 sub setup_early_name_alias {
12642     my $file= shift;
12643     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12644
12645     # This has the effect of pretending that the Name_Alias property was
12646     # available in all Unicode releases.  Strictly speaking, this property
12647     # should not be availabe in early releases, but doing this allows
12648     # charnames.pm to work on older releases without change.  Prior to v5.16
12649     # it had these names hard-coded inside it.  Unicode 6.1 came along and
12650     # created these names, and so they were removed from charnames.
12651
12652     my $aliases = property_ref('Name_Alias');
12653     if (! defined $aliases) {
12654         $aliases = Property->new('Name_Alias', Default_Map => "");
12655     }
12656
12657     $file->insert_lines(get_old_name_aliases());
12658
12659     return;
12660 }
12661
12662 sub get_old_name_aliases () {
12663
12664     # The Unicode_1_Name field, contains most of these names.  One would
12665     # expect, given the field's name, that its values would be fixed across
12666     # versions, giving the true Unicode version 1 name for the character.
12667     # Sadly, this is not the case.  Actually Version 1.1.5 had no names for
12668     # any of the controls; Version 2.0 introduced names for the C0 controls,
12669     # and 3.0 introduced C1 names.  3.0.1 removed the name INDEX; and 3.2
12670     # changed some names: it
12671     #   changed to parenthesized versions like "NEXT LINE" to
12672     #       "NEXT LINE (NEL)";
12673     #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12674     #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12675     #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12676     # This list contains all the names that were defined so that
12677     # charnames::vianame(), etc. understand them all EVEN if this version of
12678     # Unicode didn't specify them (this could be construed as a bug).
12679     # mktables elsewhere gives preference to the Unicode_1_Name field over
12680     # these names, so that viacode() will return the correct value for that
12681     # version of Unicode, except when that version doesn't define a name,
12682     # viacode() will return one anyway (this also could be construed as a
12683     # bug).  But these potential "bugs" allow for the smooth working of code
12684     # on earlier Unicode releases.
12685
12686     my @return = split /\n/, <<'END';
12687 0000;NULL;control
12688 0000;NUL;abbreviation
12689 0001;START OF HEADING;control
12690 0001;SOH;abbreviation
12691 0002;START OF TEXT;control
12692 0002;STX;abbreviation
12693 0003;END OF TEXT;control
12694 0003;ETX;abbreviation
12695 0004;END OF TRANSMISSION;control
12696 0004;EOT;abbreviation
12697 0005;ENQUIRY;control
12698 0005;ENQ;abbreviation
12699 0006;ACKNOWLEDGE;control
12700 0006;ACK;abbreviation
12701 0007;BELL;control
12702 0007;BEL;abbreviation
12703 0008;BACKSPACE;control
12704 0008;BS;abbreviation
12705 0009;CHARACTER TABULATION;control
12706 0009;HORIZONTAL TABULATION;control
12707 0009;HT;abbreviation
12708 0009;TAB;abbreviation
12709 000A;LINE FEED;control
12710 000A;LINE FEED (LF);control
12711 000A;NEW LINE;control
12712 000A;END OF LINE;control
12713 000A;LF;abbreviation
12714 000A;NL;abbreviation
12715 000A;EOL;abbreviation
12716 000B;LINE TABULATION;control
12717 000B;VERTICAL TABULATION;control
12718 000B;VT;abbreviation
12719 000C;FORM FEED;control
12720 000C;FORM FEED (FF);control
12721 000C;FF;abbreviation
12722 000D;CARRIAGE RETURN;control
12723 000D;CARRIAGE RETURN (CR);control
12724 000D;CR;abbreviation
12725 000E;SHIFT OUT;control
12726 000E;LOCKING-SHIFT ONE;control
12727 000E;SO;abbreviation
12728 000F;SHIFT IN;control
12729 000F;LOCKING-SHIFT ZERO;control
12730 000F;SI;abbreviation
12731 0010;DATA LINK ESCAPE;control
12732 0010;DLE;abbreviation
12733 0011;DEVICE CONTROL ONE;control
12734 0011;DC1;abbreviation
12735 0012;DEVICE CONTROL TWO;control
12736 0012;DC2;abbreviation
12737 0013;DEVICE CONTROL THREE;control
12738 0013;DC3;abbreviation
12739 0014;DEVICE CONTROL FOUR;control
12740 0014;DC4;abbreviation
12741 0015;NEGATIVE ACKNOWLEDGE;control
12742 0015;NAK;abbreviation
12743 0016;SYNCHRONOUS IDLE;control
12744 0016;SYN;abbreviation
12745 0017;END OF TRANSMISSION BLOCK;control
12746 0017;ETB;abbreviation
12747 0018;CANCEL;control
12748 0018;CAN;abbreviation
12749 0019;END OF MEDIUM;control
12750 0019;EOM;abbreviation
12751 001A;SUBSTITUTE;control
12752 001A;SUB;abbreviation
12753 001B;ESCAPE;control
12754 001B;ESC;abbreviation
12755 001C;INFORMATION SEPARATOR FOUR;control
12756 001C;FILE SEPARATOR;control
12757 001C;FS;abbreviation
12758 001D;INFORMATION SEPARATOR THREE;control
12759 001D;GROUP SEPARATOR;control
12760 001D;GS;abbreviation
12761 001E;INFORMATION SEPARATOR TWO;control
12762 001E;RECORD SEPARATOR;control
12763 001E;RS;abbreviation
12764 001F;INFORMATION SEPARATOR ONE;control
12765 001F;UNIT SEPARATOR;control
12766 001F;US;abbreviation
12767 0020;SP;abbreviation
12768 007F;DELETE;control
12769 007F;DEL;abbreviation
12770 0080;PADDING CHARACTER;figment
12771 0080;PAD;abbreviation
12772 0081;HIGH OCTET PRESET;figment
12773 0081;HOP;abbreviation
12774 0082;BREAK PERMITTED HERE;control
12775 0082;BPH;abbreviation
12776 0083;NO BREAK HERE;control
12777 0083;NBH;abbreviation
12778 0084;INDEX;control
12779 0084;IND;abbreviation
12780 0085;NEXT LINE;control
12781 0085;NEXT LINE (NEL);control
12782 0085;NEL;abbreviation
12783 0086;START OF SELECTED AREA;control
12784 0086;SSA;abbreviation
12785 0087;END OF SELECTED AREA;control
12786 0087;ESA;abbreviation
12787 0088;CHARACTER TABULATION SET;control
12788 0088;HORIZONTAL TABULATION SET;control
12789 0088;HTS;abbreviation
12790 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12791 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12792 0089;HTJ;abbreviation
12793 008A;LINE TABULATION SET;control
12794 008A;VERTICAL TABULATION SET;control
12795 008A;VTS;abbreviation
12796 008B;PARTIAL LINE FORWARD;control
12797 008B;PARTIAL LINE DOWN;control
12798 008B;PLD;abbreviation
12799 008C;PARTIAL LINE BACKWARD;control
12800 008C;PARTIAL LINE UP;control
12801 008C;PLU;abbreviation
12802 008D;REVERSE LINE FEED;control
12803 008D;REVERSE INDEX;control
12804 008D;RI;abbreviation
12805 008E;SINGLE SHIFT TWO;control
12806 008E;SINGLE-SHIFT-2;control
12807 008E;SS2;abbreviation
12808 008F;SINGLE SHIFT THREE;control
12809 008F;SINGLE-SHIFT-3;control
12810 008F;SS3;abbreviation
12811 0090;DEVICE CONTROL STRING;control
12812 0090;DCS;abbreviation
12813 0091;PRIVATE USE ONE;control
12814 0091;PRIVATE USE-1;control
12815 0091;PU1;abbreviation
12816 0092;PRIVATE USE TWO;control
12817 0092;PRIVATE USE-2;control
12818 0092;PU2;abbreviation
12819 0093;SET TRANSMIT STATE;control
12820 0093;STS;abbreviation
12821 0094;CANCEL CHARACTER;control
12822 0094;CCH;abbreviation
12823 0095;MESSAGE WAITING;control
12824 0095;MW;abbreviation
12825 0096;START OF GUARDED AREA;control
12826 0096;START OF PROTECTED AREA;control
12827 0096;SPA;abbreviation
12828 0097;END OF GUARDED AREA;control
12829 0097;END OF PROTECTED AREA;control
12830 0097;EPA;abbreviation
12831 0098;START OF STRING;control
12832 0098;SOS;abbreviation
12833 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12834 0099;SGC;abbreviation
12835 009A;SINGLE CHARACTER INTRODUCER;control
12836 009A;SCI;abbreviation
12837 009B;CONTROL SEQUENCE INTRODUCER;control
12838 009B;CSI;abbreviation
12839 009C;STRING TERMINATOR;control
12840 009C;ST;abbreviation
12841 009D;OPERATING SYSTEM COMMAND;control
12842 009D;OSC;abbreviation
12843 009E;PRIVACY MESSAGE;control
12844 009E;PM;abbreviation
12845 009F;APPLICATION PROGRAM COMMAND;control
12846 009F;APC;abbreviation
12847 00A0;NBSP;abbreviation
12848 00AD;SHY;abbreviation
12849 200B;ZWSP;abbreviation
12850 200C;ZWNJ;abbreviation
12851 200D;ZWJ;abbreviation
12852 200E;LRM;abbreviation
12853 200F;RLM;abbreviation
12854 202A;LRE;abbreviation
12855 202B;RLE;abbreviation
12856 202C;PDF;abbreviation
12857 202D;LRO;abbreviation
12858 202E;RLO;abbreviation
12859 FEFF;BYTE ORDER MARK;alternate
12860 FEFF;BOM;abbreviation
12861 FEFF;ZWNBSP;abbreviation
12862 END
12863
12864     if ($v_version ge v3.0.0) {
12865         push @return, split /\n/, <<'END';
12866 180B; FVS1; abbreviation
12867 180C; FVS2; abbreviation
12868 180D; FVS3; abbreviation
12869 180E; MVS; abbreviation
12870 202F; NNBSP; abbreviation
12871 END
12872     }
12873
12874     if ($v_version ge v3.2.0) {
12875         push @return, split /\n/, <<'END';
12876 034F; CGJ; abbreviation
12877 205F; MMSP; abbreviation
12878 2060; WJ; abbreviation
12879 END
12880         # Add in VS1..VS16
12881         my $cp = 0xFE00 - 1;
12882         for my $i (1..16) {
12883             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12884         }
12885     }
12886     if ($v_version ge v4.0.0) { # Add in VS17..VS256
12887         my $cp = 0xE0100 - 17;
12888         for my $i (17..256) {
12889             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12890         }
12891     }
12892
12893     # ALERT did not come along until 6.0, at which point it became preferred
12894     # over BELL, and was never in the Unicode_1_Name field.  For the same
12895     # reasons, that the other names are made known to all releases by this
12896     # function, we make ALERT known too.  By inserting it
12897     # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12898     my $alert = '0007; ALERT; control';
12899     if ($v_version lt v6.0.0) {
12900         push @return, $alert;
12901     }
12902     else {
12903         unshift @return, $alert;
12904     }
12905
12906     return @return;
12907 }
12908
12909 sub filter_later_version_name_alias_line {
12910
12911     # This file has an extra entry per line for the alias type.  This is
12912     # handled by creating a compound entry: "$alias: $type";  First, split
12913     # the line into components.
12914     my ($range, $alias, $type, @remainder)
12915         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12916
12917     # This file contains multiple entries for some components, so tell the
12918     # downstream code to allow this in our internal tables; the
12919     # $MULTIPLE_AFTER preserves the input ordering.
12920     $_ = join ";", $range, $CMD_DELIM
12921                            . $REPLACE_CMD
12922                            . '='
12923                            . $MULTIPLE_AFTER
12924                            . $CMD_DELIM
12925                            . "$alias: $type",
12926                    @remainder;
12927     return;
12928 }
12929
12930 sub filter_early_version_name_alias_line {
12931
12932     # Early versions did not have the trailing alias type field; implicitly it
12933     # was 'correction'.   But our synthetic lines we add in this program do
12934     # have it, so test for the type field.
12935     $_ .= "; correction" if $_ !~ /;.*;/;
12936
12937     filter_later_version_name_alias_line;
12938     return;
12939 }
12940
12941 sub finish_Unicode() {
12942     # This routine should be called after all the Unicode files have been read
12943     # in.  It:
12944     # 1) Creates properties that are missing from the version of Unicode being
12945     #    compiled, and which, for whatever reason, are needed for the Perl
12946     #    core to function properly.  These are minimally populated as
12947     #    necessary.
12948     # 2) Adds the mappings for code points missing from the files which have
12949     #    defaults specified for them.
12950     # 3) At this this point all mappings are known, so it computes the type of
12951     #    each property whose type hasn't been determined yet.
12952     # 4) Calculates all the regular expression match tables based on the
12953     #    mappings.
12954     # 5) Calculates and adds the tables which are defined by Unicode, but
12955     #    which aren't derived by them, and certain derived tables that Perl
12956     #    uses.
12957
12958     # Folding information was introduced later into Unicode data.  To get
12959     # Perl's case ignore (/i) to work at all in releases that don't have
12960     # folding, use the best available alternative, which is lower casing.
12961     my $fold = property_ref('Case_Folding');
12962     if ($fold->is_empty) {
12963         $fold->initialize(property_ref('Lowercase_Mapping'));
12964         $fold->add_note(join_lines(<<END
12965 WARNING: This table uses lower case as a substitute for missing fold
12966 information
12967 END
12968         ));
12969     }
12970
12971     # Multiple-character mapping was introduced later into Unicode data, so it
12972     # is by default the simple version.  If to output the simple versions and
12973     # not present, just use the regular (which in these Unicode versions is
12974     # the simple as well).
12975     foreach my $map (qw {   Uppercase_Mapping
12976                             Lowercase_Mapping
12977                             Titlecase_Mapping
12978                             Case_Folding
12979                         } )
12980     {
12981         my $comment = <<END;
12982
12983 Note that although the Perl core uses this file, it has the standard values
12984 for code points from U+0000 to U+00FF compiled in, so changing this table will
12985 not change the core's behavior with respect to these code points.  Use
12986 Unicode::Casing to override this table.
12987 END
12988         if ($map eq 'Case_Folding') {
12989             $comment .= <<END;
12990 (/i regex matching is not overridable except by using a custom regex engine)
12991 END
12992         }
12993         property_ref($map)->add_comment(join_lines($comment));
12994         my $simple = property_ref("Simple_$map");
12995         next if ! $simple->is_empty;
12996         if ($simple->to_output_map) {
12997             $simple->initialize(property_ref($map));
12998         }
12999         else {
13000             property_ref($map)->set_proxy_for($simple->name);
13001         }
13002     }
13003
13004     # For each property, fill in any missing mappings, and calculate the re
13005     # match tables.  If a property has more than one missing mapping, the
13006     # default is a reference to a data structure, and requires data from other
13007     # properties to resolve.  The sort is used to cause these to be processed
13008     # last, after all the other properties have been calculated.
13009     # (Fortunately, the missing properties so far don't depend on each other.)
13010     foreach my $property
13011         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13012         property_ref('*'))
13013     {
13014         # $perl has been defined, but isn't one of the Unicode properties that
13015         # need to be finished up.
13016         next if $property == $perl;
13017
13018         # Nor do we need to do anything with properties that aren't going to
13019         # be output.
13020         next if $property->fate == $SUPPRESSED;
13021
13022         # Handle the properties that have more than one possible default
13023         if (ref $property->default_map) {
13024             my $default_map = $property->default_map;
13025
13026             # These properties have stored in the default_map:
13027             # One or more of:
13028             #   1)  A default map which applies to all code points in a
13029             #       certain class
13030             #   2)  an expression which will evaluate to the list of code
13031             #       points in that class
13032             # And
13033             #   3) the default map which applies to every other missing code
13034             #      point.
13035             #
13036             # Go through each list.
13037             while (my ($default, $eval) = $default_map->get_next_defaults) {
13038
13039                 # Get the class list, and intersect it with all the so-far
13040                 # unspecified code points yielding all the code points
13041                 # in the class that haven't been specified.
13042                 my $list = eval $eval;
13043                 if ($@) {
13044                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13045                     last;
13046                 }
13047
13048                 # Narrow down the list to just those code points we don't have
13049                 # maps for yet.
13050                 $list = $list & $property->inverse_list;
13051
13052                 # Add mappings to the property for each code point in the list
13053                 foreach my $range ($list->ranges) {
13054                     $property->add_map($range->start, $range->end, $default,
13055                     Replace => $CROAK);
13056                 }
13057             }
13058
13059             # All remaining code points have the other mapping.  Set that up
13060             # so the normal single-default mapping code will work on them
13061             $property->set_default_map($default_map->other_default);
13062
13063             # And fall through to do that
13064         }
13065
13066         # We should have enough data now to compute the type of the property.
13067         my $property_name = $property->name;
13068         $property->compute_type;
13069         my $property_type = $property->type;
13070
13071         next if ! $property->to_create_match_tables;
13072
13073         # Here want to create match tables for this property
13074
13075         # The Unicode db always (so far, and they claim into the future) have
13076         # the default for missing entries in binary properties be 'N' (unless
13077         # there is a '@missing' line that specifies otherwise)
13078         if (! defined $property->default_map) {
13079             if ($property_type == $BINARY) {
13080                 $property->set_default_map('N');
13081             }
13082             elsif ($property_type == $ENUM) {
13083                 Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13084                 $property->set_default_map('XXX This makes sure there is a default map');
13085             }
13086         }
13087
13088         # Add any remaining code points to the mapping, using the default for
13089         # missing code points.
13090         my $default_table;
13091         if (defined (my $default_map = $property->default_map)) {
13092
13093             # Make sure there is a match table for the default
13094             if (! defined ($default_table = $property->table($default_map))) {
13095                 $default_table = $property->add_match_table($default_map);
13096             }
13097
13098             # And, if the property is binary, the default table will just
13099             # be the complement of the other table.
13100             if ($property_type == $BINARY) {
13101                 my $non_default_table;
13102
13103                 # Find the non-default table.
13104                 for my $table ($property->tables) {
13105                     next if $table == $default_table;
13106                     $non_default_table = $table;
13107                 }
13108                 $default_table->set_complement($non_default_table);
13109             }
13110             else {
13111
13112                 # This fills in any missing values with the default.  It's not
13113                 # necessary to do this with binary properties, as the default
13114                 # is defined completely in terms of the Y table.
13115                 $property->add_map(0, $MAX_WORKING_CODEPOINT,
13116                                    $default_map, Replace => $NO);
13117             }
13118         }
13119
13120         # Have all we need to populate the match tables.
13121         my $maps_should_be_defined = $property->pre_declared_maps;
13122         foreach my $range ($property->ranges) {
13123             my $map = $range->value;
13124             my $table = $property->table($map);
13125             if (! defined $table) {
13126
13127                 # Integral and rational property values are not necessarily
13128                 # defined in PropValueAliases, but whether all the other ones
13129                 # should be depends on the property.
13130                 if ($maps_should_be_defined
13131                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13132                 {
13133                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
13134                 }
13135                 $table = $property->add_match_table($map);
13136             }
13137
13138             next if $table->complement != 0;    # Don't need to populate these
13139             $table->add_range($range->start, $range->end);
13140         }
13141
13142         # A forced binary property has additional true/false tables which
13143         # should have been set up when it was forced into binary.  The false
13144         # table matches exactly the same set as the property's default table.
13145         # The true table matches the complement of that.  The false table is
13146         # not the same as an additional set of aliases on top of the default
13147         # table, so use 'set_equivalent_to'.  If it were implemented as
13148         # additional aliases, various things would have to be adjusted, but
13149         # especially, if the user wants to get a list of names for the table
13150         # using Unicode::UCD::prop_value_aliases(), s/he should get a
13151         # different set depending on whether they want the default table or
13152         # the false table.
13153         if ($property_type == $FORCED_BINARY) {
13154             $property->table('N')->set_equivalent_to($default_table,
13155                                                      Related => 1);
13156             $property->table('Y')->set_complement($default_table);
13157         }
13158
13159         # For Perl 5.6 compatibility, all properties matchable in regexes can
13160         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
13161         # But warn if this creates a conflict with a (new) Unicode property
13162         # name, although it appears that Unicode has made a decision never to
13163         # begin a property name with 'Is_', so this shouldn't happen.
13164         foreach my $alias ($property->aliases) {
13165             my $Is_name = 'Is_' . $alias->name;
13166             if (defined (my $pre_existing = property_ref($Is_name))) {
13167                 Carp::my_carp(<<END
13168 There is already an alias named $Is_name (from " . $pre_existing . "), so
13169 creating one for $property won't work.  This is bad news.  If it is not too
13170 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13171 from the git blame log for this area of the code that suppressed individual
13172 aliases that conflict with the new Unicode names.  Proceeding anyway.
13173 END
13174                 );
13175             }
13176         } # End of loop through aliases for this property
13177     } # End of loop through all Unicode properties.
13178
13179     # Fill in the mappings that Unicode doesn't completely furnish.  First the
13180     # single letter major general categories.  If Unicode were to start
13181     # delivering the values, this would be redundant, but better that than to
13182     # try to figure out if should skip and not get it right.  Ths could happen
13183     # if a new major category were to be introduced, and the hard-coded test
13184     # wouldn't know about it.
13185     # This routine depends on the standard names for the general categories
13186     # being what it thinks they are, like 'Cn'.  The major categories are the
13187     # union of all the general category tables which have the same first
13188     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13189     foreach my $minor_table ($gc->tables) {
13190         my $minor_name = $minor_table->name;
13191         next if length $minor_name == 1;
13192         if (length $minor_name != 2) {
13193             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13194             next;
13195         }
13196
13197         my $major_name = uc(substr($minor_name, 0, 1));
13198         my $major_table = $gc->table($major_name);
13199         $major_table += $minor_table;
13200     }
13201
13202     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13203     # defines it as LC)
13204     my $LC = $gc->table('LC');
13205     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13206     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13207
13208
13209     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13210                          # deliver the correct values in it
13211         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13212
13213         # Lt not in release 1.
13214         if (defined $gc->table('Lt')) {
13215             $LC += $gc->table('Lt');
13216             $gc->table('Lt')->set_caseless_equivalent($LC);
13217         }
13218     }
13219     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13220
13221     $gc->table('Ll')->set_caseless_equivalent($LC);
13222     $gc->table('Lu')->set_caseless_equivalent($LC);
13223
13224     my $Cs = $gc->table('Cs');
13225
13226     # Create digit and case fold tables with the original file names for
13227     # backwards compatibility with applications that read them directly.
13228     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13229                               Default_Map => "",
13230                               File => 'Digit',    # Trad. location
13231                               Directory => $map_directory,
13232                               Type => $STRING,
13233                               Replacement_Property => "Perl_Decimal_Digit",
13234                               Initialize => property_ref('Perl_Decimal_Digit'),
13235                             );
13236     $Digit->add_comment(join_lines(<<END
13237 This file gives the mapping of all code points which represent a single
13238 decimal digit [0-9] to their respective digits.  For example, the code point
13239 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
13240 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13241 numerals.
13242 END
13243     ));
13244
13245     Property->new('Legacy_Case_Folding',
13246                     File => "Fold",
13247                     Directory => $map_directory,
13248                     Default_Map => $CODE_POINT,
13249                     Type => $STRING,
13250                     Replacement_Property => "Case_Folding",
13251                     Format => $HEX_FORMAT,
13252                     Initialize => property_ref('cf'),
13253     );
13254
13255     # The Script_Extensions property started out as a clone of the Script
13256     # property.  But processing its data file caused some elements to be
13257     # replaced with different data.  (These elements were for the Common and
13258     # Inherited properties.)  This data is a qw() list of all the scripts that
13259     # the code points in the given range are in.  An example line is:
13260     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
13261     #
13262     # The code above has created a new match table named "Arab Syrc Thaa"
13263     # which contains 060C.  (The cloned table started out with this code point
13264     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
13265     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
13266     # match table.  This is repeated for all these tables and ranges.  The map
13267     # data is retained in the map table for reference, but the spurious match
13268     # tables are deleted.
13269
13270     my $scx = property_ref("Script_Extensions");
13271     if (defined $scx) {
13272         foreach my $table ($scx->tables) {
13273             next unless $table->name =~ /\s/;   # All the new and only the new
13274                                                 # tables have a space in their
13275                                                 # names
13276             my @scripts = split /\s+/, $table->name;
13277             foreach my $script (@scripts) {
13278                 my $script_table = $scx->table($script);
13279                 $script_table += $table;
13280             }
13281             $scx->delete_match_table($table);
13282         }
13283     }
13284
13285     return;
13286 }
13287
13288 sub pre_3_dot_1_Nl () {
13289
13290     # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13291     # is when Unicode's became fully usable.  These code points were
13292     # determined by inspection and experimentation.  gc=nl is important for
13293     # certain Perl-extension properties that should be available in all
13294     # releases.
13295
13296     my $Nl = Range_List->new();
13297     if (defined (my $official = $gc->table('Nl'))) {
13298         $Nl += $official;
13299     }
13300     else {
13301         $Nl->add_range(0x2160, 0x2182);
13302         $Nl->add_range(0x3007, 0x3007);
13303         $Nl->add_range(0x3021, 0x3029);
13304     }
13305     $Nl->add_range(0xFE20, 0xFE23);
13306     $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13307                                                             # these were added
13308     return $Nl;
13309 }
13310
13311 sub compile_perl() {
13312     # Create perl-defined tables.  Almost all are part of the pseudo-property
13313     # named 'perl' internally to this program.  Many of these are recommended
13314     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
13315     # on those found there.
13316     # Almost all of these are equivalent to some Unicode property.
13317     # A number of these properties have equivalents restricted to the ASCII
13318     # range, with their names prefaced by 'Posix', to signify that these match
13319     # what the Posix standard says they should match.  A couple are
13320     # effectively this, but the name doesn't have 'Posix' in it because there
13321     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
13322     # to the full Unicode range, by our guesses as to what is appropriate.
13323
13324     # 'All' is all code points.  As an error check, instead of just setting it
13325     # to be that, construct it to be the union of all the major categories
13326     $All = $perl->add_match_table('All',
13327       Description
13328         => "All code points, including those above Unicode.  Same as qr/./s",
13329       Matches_All => 1);
13330
13331     foreach my $major_table ($gc->tables) {
13332
13333         # Major categories are the ones with single letter names.
13334         next if length($major_table->name) != 1;
13335
13336         $All += $major_table;
13337     }
13338
13339     if ($All->max != $MAX_WORKING_CODEPOINT) {
13340         Carp::my_carp_bug("Generated highest code point ("
13341            . sprintf("%X", $All->max)
13342            . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
13343     }
13344     if ($All->range_count != 1 || $All->min != 0) {
13345      Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
13346     }
13347
13348     my $Any = $perl->add_match_table('Any',
13349                                      Description  => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]",
13350                                      );
13351     $Any->add_range(0, 0x10FFFF);
13352     $Any->add_alias('Unicode');
13353
13354     # Assigned is the opposite of gc=unassigned
13355     my $Assigned = $perl->add_match_table('Assigned',
13356                                 Description  => "All assigned code points",
13357                                 Initialize => ~ $gc->table('Unassigned'),
13358                                 );
13359
13360     # Our internal-only property should be treated as more than just a
13361     # synonym; grandfather it in to the pod.
13362     $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
13363                             Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
13364             ->set_equivalent_to(property_ref('ccc')->table('Above'),
13365                                                                 Related => 1);
13366
13367     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
13368     if (defined $block) {   # This is equivalent to the block if have it.
13369         my $Unicode_ASCII = $block->table('Basic_Latin');
13370         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
13371             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
13372         }
13373     }
13374
13375     # Very early releases didn't have blocks, so initialize ASCII ourselves if
13376     # necessary
13377     if ($ASCII->is_empty) {
13378         if (! NON_ASCII_PLATFORM) {
13379             $ASCII->add_range(0, 127);
13380         }
13381         else {
13382             for my $i (0 .. 127) {
13383                 $ASCII->add_range(utf8::unicode_to_native($i),
13384                                   utf8::unicode_to_native($i));
13385             }
13386         }
13387     }
13388
13389     # Get the best available case definitions.  Early Unicode versions didn't
13390     # have Uppercase and Lowercase defined, so use the general category
13391     # instead for them, modified by hard-coding in the code points each is
13392     # missing.
13393     my $Lower = $perl->add_match_table('Lower');
13394     my $Unicode_Lower = property_ref('Lowercase');
13395     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
13396         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
13397
13398     }
13399     else {
13400         $Lower += $gc->table('Lowercase_Letter');
13401
13402         # There are quite a few code points in Lower, that aren't in gc=lc,
13403         # and not all are in all releases.
13404         foreach my $code_point (    utf8::unicode_to_native(0xAA),
13405                                     utf8::unicode_to_native(0xBA),
13406                                     0x02B0 .. 0x02B8,
13407                                     0x02C0 .. 0x02C1,
13408                                     0x02E0 .. 0x02E4,
13409                                     0x0345,
13410                                     0x037A,
13411                                     0x1D2C .. 0x1D6A,
13412                                     0x1D78,
13413                                     0x1D9B .. 0x1DBF,
13414                                     0x2071,
13415                                     0x207F,
13416                                     0x2090 .. 0x209C,
13417                                     0x2170 .. 0x217F,
13418                                     0x24D0 .. 0x24E9,
13419                                     0x2C7C .. 0x2C7D,
13420                                     0xA770,
13421                                     0xA7F8 .. 0xA7F9,
13422         ) {
13423             # Don't include the code point unless it is assigned in this
13424             # release
13425             my $category = $gc->value_of(hex $code_point);
13426             next if ! defined $category || $category eq 'Cn';
13427
13428             $Lower += $code_point;
13429         }
13430     }
13431     $Lower->add_alias('XPosixLower');
13432     my $Posix_Lower = $perl->add_match_table("PosixLower",
13433                             Description => "[a-z]",
13434                             Initialize => $Lower & $ASCII,
13435                             );
13436
13437     my $Upper = $perl->add_match_table('Upper');
13438     my $Unicode_Upper = property_ref('Uppercase');
13439     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
13440         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
13441     }
13442     else {
13443
13444         # Unlike Lower, there are only two ranges in Upper that aren't in
13445         # gc=Lu, and all code points were assigned in all releases.
13446         $Upper += $gc->table('Uppercase_Letter');
13447         $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
13448         $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
13449     }
13450     $Upper->add_alias('XPosixUpper');
13451     my $Posix_Upper = $perl->add_match_table("PosixUpper",
13452                             Description => "[A-Z]",
13453                             Initialize => $Upper & $ASCII,
13454                             );
13455
13456     # Earliest releases didn't have title case.  Initialize it to empty if not
13457     # otherwise present
13458     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
13459                                        Description => '(= \p{Gc=Lt})');
13460     my $lt = $gc->table('Lt');
13461
13462     # Earlier versions of mktables had this related to $lt since they have
13463     # identical code points, but their caseless equivalents are not the same,
13464     # one being 'Cased' and the other being 'LC', and so now must be kept as
13465     # separate entities.
13466     if (defined $lt) {
13467         $Title += $lt;
13468     }
13469     else {
13470         push @tables_that_may_be_empty, $Title->complete_name;
13471     }
13472
13473     my $Unicode_Cased = property_ref('Cased');
13474     if (defined $Unicode_Cased) {
13475         my $yes = $Unicode_Cased->table('Y');
13476         my $no = $Unicode_Cased->table('N');
13477         $Title->set_caseless_equivalent($yes);
13478         if (defined $Unicode_Upper) {
13479             $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
13480             $Unicode_Upper->table('N')->set_caseless_equivalent($no);
13481         }
13482         $Upper->set_caseless_equivalent($yes);
13483         if (defined $Unicode_Lower) {
13484             $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
13485             $Unicode_Lower->table('N')->set_caseless_equivalent($no);
13486         }
13487         $Lower->set_caseless_equivalent($yes);
13488     }
13489     else {
13490         # If this Unicode version doesn't have Cased, set up the Perl
13491         # extension from first principles.  From Unicode 5.1: Definition D120:
13492         # A character C is defined to be cased if and only if C has the
13493         # Lowercase or Uppercase property or has a General_Category value of
13494         # Titlecase_Letter.
13495         my $cased = $perl->add_match_table('Cased',
13496                         Initialize => $Lower + $Upper + $Title,
13497                         Description => 'Uppercase or Lowercase or Titlecase',
13498                         );
13499         # $notcased is purely for the caseless equivalents below
13500         my $notcased = $perl->add_match_table('_Not_Cased',
13501                                 Initialize => ~ $cased,
13502                                 Fate => $INTERNAL_ONLY,
13503                                 Description => 'All not-cased code points');
13504         $Title->set_caseless_equivalent($cased);
13505         if (defined $Unicode_Upper) {
13506             $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
13507             $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
13508         }
13509         $Upper->set_caseless_equivalent($cased);
13510         if (defined $Unicode_Lower) {
13511             $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
13512             $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
13513         }
13514         $Lower->set_caseless_equivalent($cased);
13515     }
13516
13517     # Similarly, set up our own Case_Ignorable property if this Unicode
13518     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
13519     # C is defined to be case-ignorable if C has the value MidLetter or the
13520     # value MidNumLet for the Word_Break property or its General_Category is
13521     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13522     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13523
13524     # Perl has long had an internal-only alias for this property; grandfather
13525     # it in to the pod, but discourage its use.
13526     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13527                                                      Re_Pod_Entry => 1,
13528                                                      Fate => $INTERNAL_ONLY,
13529                                                      Status => $DISCOURAGED);
13530     my $case_ignorable = property_ref('Case_Ignorable');
13531     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13532         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13533                                                                 Related => 1);
13534     }
13535     else {
13536
13537         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13538
13539         # The following three properties are not in early releases
13540         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13541         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13542         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13543
13544         # For versions 4.1 - 5.0, there is no MidNumLet property, and
13545         # correspondingly the case-ignorable definition lacks that one.  For
13546         # 4.0, it appears that it was meant to be the same definition, but was
13547         # inadvertently omitted from the standard's text, so add it if the
13548         # property actually is there
13549         my $wb = property_ref('Word_Break');
13550         if (defined $wb) {
13551             my $midlet = $wb->table('MidLetter');
13552             $perl_case_ignorable += $midlet if defined $midlet;
13553             my $midnumlet = $wb->table('MidNumLet');
13554             $perl_case_ignorable += $midnumlet if defined $midnumlet;
13555         }
13556         else {
13557
13558             # In earlier versions of the standard, instead of the above two
13559             # properties , just the following characters were used:
13560             $perl_case_ignorable +=
13561                             ord("'")
13562                         +   utf8::unicode_to_native(0xAD)  # SOFT HYPHEN (SHY)
13563                         +   0x2019; # RIGHT SINGLE QUOTATION MARK
13564         }
13565     }
13566
13567     # The remaining perl defined tables are mostly based on Unicode TR 18,
13568     # "Annex C: Compatibility Properties".  All of these have two versions,
13569     # one whose name generally begins with Posix that is posix-compliant, and
13570     # one that matches Unicode characters beyond the Posix, ASCII range
13571
13572     my $Alpha = $perl->add_match_table('Alpha');
13573
13574     # Alphabetic was not present in early releases
13575     my $Alphabetic = property_ref('Alphabetic');
13576     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13577         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13578     }
13579     else {
13580
13581         # The Alphabetic property doesn't exist for early releases, so
13582         # generate it.  The actual definition, in 5.2 terms is:
13583         #
13584         # gc=L + gc=Nl + Other_Alphabetic
13585         #
13586         # Other_Alphabetic is also not defined in these early releases, but it
13587         # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13588         # those last two as well, then subtract the relatively few of them that
13589         # shouldn't have been added.  (The gc=So range is the circled capital
13590         # Latin characters.  Early releases mistakenly didn't also include the
13591         # lower-case versions of these characters, and so we don't either, to
13592         # maintain consistency with those releases that first had this
13593         # property.
13594         $Alpha->initialize($gc->table('Letter')
13595                            + pre_3_dot_1_Nl()
13596                            + $gc->table('Mn')
13597                            + $gc->table('Mc')
13598                         );
13599         $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
13600         foreach my $range (     [ 0x0300, 0x0344 ],
13601                                 [ 0x0346, 0x034E ],
13602                                 [ 0x0360, 0x0362 ],
13603                                 [ 0x0483, 0x0486 ],
13604                                 [ 0x0591, 0x05AF ],
13605                                 [ 0x06DF, 0x06E0 ],
13606                                 [ 0x06EA, 0x06EC ],
13607                                 [ 0x0740, 0x074A ],
13608                                 0x093C,
13609                                 0x094D,
13610                                 [ 0x0951, 0x0954 ],
13611                                 0x09BC,
13612                                 0x09CD,
13613                                 0x0A3C,
13614                                 0x0A4D,
13615                                 0x0ABC,
13616                                 0x0ACD,
13617                                 0x0B3C,
13618                                 0x0B4D,
13619                                 0x0BCD,
13620                                 0x0C4D,
13621                                 0x0CCD,
13622                                 0x0D4D,
13623                                 0x0DCA,
13624                                 [ 0x0E47, 0x0E4C ],
13625                                 0x0E4E,
13626                                 [ 0x0EC8, 0x0ECC ],
13627                                 [ 0x0F18, 0x0F19 ],
13628                                 0x0F35,
13629                                 0x0F37,
13630                                 0x0F39,
13631                                 [ 0x0F3E, 0x0F3F ],
13632                                 [ 0x0F82, 0x0F84 ],
13633                                 [ 0x0F86, 0x0F87 ],
13634                                 0x0FC6,
13635                                 0x1037,
13636                                 0x1039,
13637                                 [ 0x17C9, 0x17D3 ],
13638                                 [ 0x20D0, 0x20DC ],
13639                                 0x20E1,
13640                                 [ 0x302A, 0x302F ],
13641                                 [ 0x3099, 0x309A ],
13642                                 [ 0xFE20, 0xFE23 ],
13643                                 [ 0x1D165, 0x1D169 ],
13644                                 [ 0x1D16D, 0x1D172 ],
13645                                 [ 0x1D17B, 0x1D182 ],
13646                                 [ 0x1D185, 0x1D18B ],
13647                                 [ 0x1D1AA, 0x1D1AD ],
13648         ) {
13649             if (ref $range) {
13650                 $Alpha->delete_range($range->[0], $range->[1]);
13651             }
13652             else {
13653                 $Alpha->delete_range($range, $range);
13654             }
13655         }
13656         $Alpha->add_description('Alphabetic');
13657         $Alpha->add_alias('Alphabetic');
13658     }
13659     $Alpha->add_alias('XPosixAlpha');
13660     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13661                             Description => "[A-Za-z]",
13662                             Initialize => $Alpha & $ASCII,
13663                             );
13664     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13665     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13666
13667     my $Alnum = $perl->add_match_table('Alnum',
13668                         Description => 'Alphabetic and (decimal) Numeric',
13669                         Initialize => $Alpha + $gc->table('Decimal_Number'),
13670                         );
13671     $Alnum->add_alias('XPosixAlnum');
13672     $perl->add_match_table("PosixAlnum",
13673                             Description => "[A-Za-z0-9]",
13674                             Initialize => $Alnum & $ASCII,
13675                             );
13676
13677     my $Word = $perl->add_match_table('Word',
13678                                 Description => '\w, including beyond ASCII;'
13679                                             . ' = \p{Alnum} + \pM + \p{Pc}',
13680                                 Initialize => $Alnum + $gc->table('Mark'),
13681                                 );
13682     $Word->add_alias('XPosixWord');
13683     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13684     if (defined $Pc) {
13685         $Word += $Pc;
13686     }
13687     else {
13688         $Word += ord('_');  # Make sure this is a $Word
13689     }
13690     my $JC = property_ref('Join_Control');  # Wasn't in release 1
13691     if (defined $JC) {
13692         $Word += $JC->table('Y');
13693     }
13694     else {
13695         $Word += 0x200C + 0x200D;
13696     }
13697
13698     # This is a Perl extension, so the name doesn't begin with Posix.
13699     my $PerlWord = $perl->add_match_table('PerlWord',
13700                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13701                     Initialize => $Word & $ASCII,
13702                     );
13703     $PerlWord->add_alias('PosixWord');
13704
13705     my $Blank = $perl->add_match_table('Blank',
13706                                 Description => '\h, Horizontal white space',
13707
13708                                 # 200B is Zero Width Space which is for line
13709                                 # break control, and was listed as
13710                                 # Space_Separator in early releases
13711                                 Initialize => $gc->table('Space_Separator')
13712                                             +   ord("\t")
13713                                             -   0x200B, # ZWSP
13714                                 );
13715     $Blank->add_alias('HorizSpace');        # Another name for it.
13716     $Blank->add_alias('XPosixBlank');
13717     $perl->add_match_table("PosixBlank",
13718                             Description => "\\t and ' '",
13719                             Initialize => $Blank & $ASCII,
13720                             );
13721
13722     my $VertSpace = $perl->add_match_table('VertSpace',
13723                             Description => '\v',
13724                             Initialize =>
13725                                $gc->table('Line_Separator')
13726                              + $gc->table('Paragraph_Separator')
13727                              + utf8::unicode_to_native(0x0A)  # LINE FEED
13728                              + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
13729                              + ord("\f")
13730                              + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
13731                              + utf8::unicode_to_native(0x85)  # NEL
13732                     );
13733     # No Posix equivalent for vertical space
13734
13735     my $Space = $perl->add_match_table('Space',
13736                 Description => '\s including beyond ASCII and vertical tab',
13737                 Initialize => $Blank + $VertSpace,
13738     );
13739     $Space->add_alias('XPosixSpace');
13740     my $posix_space = $perl->add_match_table("PosixSpace",
13741                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
13742                             Initialize => $Space & $ASCII,
13743                             );
13744
13745     # Perl's traditional space doesn't include Vertical Tab prior to v5.18
13746     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13747                                   Description => '\s, including beyond ASCII',
13748                                   Initialize => $Space,
13749                                   #Initialize => $Space
13750                                   # - utf8::unicode_to_native(0x0B]
13751                                 );
13752     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
13753     my $PerlSpace = $perl->add_match_table('PerlSpace',
13754                         Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13755                         Initialize => $XPerlSpace & $ASCII,
13756                             );
13757
13758
13759     my $Cntrl = $perl->add_match_table('Cntrl',
13760                                         Description => 'Control characters');
13761     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13762     $Cntrl->add_alias('XPosixCntrl');
13763     $perl->add_match_table("PosixCntrl",
13764                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13765                             Initialize => $Cntrl & $ASCII,
13766                             );
13767
13768     # $controls is a temporary used to construct Graph.
13769     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13770                                                 + $gc->table('Control'));
13771     # Cs not in release 1
13772     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13773
13774     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
13775     my $Graph = $perl->add_match_table('Graph',
13776                         Description => 'Characters that are graphical',
13777                         Initialize => ~ ($Space + $controls),
13778                         );
13779     $Graph->add_alias('XPosixGraph');
13780     $perl->add_match_table("PosixGraph",
13781                             Description =>
13782                                 '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
13783                             Initialize => $Graph & $ASCII,
13784                             );
13785
13786     $print = $perl->add_match_table('Print',
13787                         Description => 'Characters that are graphical plus space characters (but no controls)',
13788                         Initialize => $Blank + $Graph - $gc->table('Control'),
13789                         );
13790     $print->add_alias('XPosixPrint');
13791     $perl->add_match_table("PosixPrint",
13792                             Description =>
13793                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13794                             Initialize => $print & $ASCII,
13795                             );
13796
13797     my $Punct = $perl->add_match_table('Punct');
13798     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13799
13800     # \p{punct} doesn't include the symbols, which posix does
13801     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13802                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
13803                     Initialize => $gc->table('Punctuation')
13804                                 + ($ASCII & $gc->table('Symbol')),
13805                                 Perl_Extension => 1
13806         );
13807     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13808         Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13809         Initialize => $ASCII & $XPosixPunct,
13810         );
13811
13812     my $Digit = $perl->add_match_table('Digit',
13813                             Description => '[0-9] + all other decimal digits');
13814     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13815     $Digit->add_alias('XPosixDigit');
13816     my $PosixDigit = $perl->add_match_table("PosixDigit",
13817                                             Description => '[0-9]',
13818                                             Initialize => $Digit & $ASCII,
13819                                             );
13820
13821     # Hex_Digit was not present in first release
13822     my $Xdigit = $perl->add_match_table('XDigit');
13823     $Xdigit->add_alias('XPosixXDigit');
13824     my $Hex = property_ref('Hex_Digit');
13825     if (defined $Hex && ! $Hex->is_empty) {
13826         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13827     }
13828     else {
13829         $Xdigit->initialize([ ord('0') .. ord('9'),
13830                               ord('A') .. ord('F'),
13831                               ord('a') .. ord('f'),
13832                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13833         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13834     }
13835
13836     # AHex was not present in early releases
13837     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13838     my $AHex = property_ref('ASCII_Hex_Digit');
13839     if (defined $AHex && ! $AHex->is_empty) {
13840         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13841     }
13842     else {
13843         $PosixXDigit->initialize($Xdigit & $ASCII);
13844         $PosixXDigit->add_alias('AHex');
13845         $PosixXDigit->add_alias('Ascii_Hex_Digit');
13846     }
13847     $PosixXDigit->add_description('[0-9A-Fa-f]');
13848
13849     my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
13850                     Description => "Code points that particpate in some fold",
13851                     );
13852     my $loc_problem_folds = $perl->add_match_table(
13853                "_Perl_Problematic_Locale_Folds",
13854                Description =>
13855                    "Code points that are in some way problematic under locale",
13856     );
13857
13858     # This allows regexec.c to skip some work when appropriate.  Some of the
13859     # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
13860     my $loc_problem_folds_start = $perl->add_match_table(
13861                "_Perl_Problematic_Locale_Foldeds_Start",
13862                Description =>
13863                    "The first character of every sequence in _Perl_Problematic_Locale_Folds",
13864     );
13865
13866     my $cf = property_ref('Case_Folding');
13867
13868     # Every character 0-255 is problematic because what each folds to depends
13869     # on the current locale
13870     $loc_problem_folds->add_range(0, 255);
13871     $loc_problem_folds_start += $loc_problem_folds;
13872
13873     # Also problematic are anything these fold to outside the range.  Likely
13874     # forever the only thing folded to by these outside the 0-255 range is the
13875     # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
13876     # completely general, which should catch any unexpected changes or errors.
13877     # We look at each code point 0-255, and add its fold (including each part
13878     # of a multi-char fold) to the list.  See commit message
13879     # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
13880     # of the MU issue.
13881     foreach my $range ($loc_problem_folds->ranges) {
13882         foreach my $code_point($range->start .. $range->end) {
13883             my $fold_range = $cf->containing_range($code_point);
13884             next unless defined $fold_range;
13885
13886             my @hex_folds = split " ", $fold_range->value;
13887             my $start_cp = hex $hex_folds[0];
13888             foreach my $i (0 .. @hex_folds - 1) {
13889                 my $cp = hex $hex_folds[$i];
13890                 next unless $cp > 255;    # Already have the < 256 ones
13891
13892                 $loc_problem_folds->add_range($cp, $cp);
13893                 $loc_problem_folds_start->add_range($start_cp, $start_cp);
13894             }
13895         }
13896     }
13897
13898     my $folds_to_multi_char = $perl->add_match_table(
13899          "_Perl_Folds_To_Multi_Char",
13900          Description =>
13901               "Code points whose fold is a string of more than one character",
13902     );
13903
13904     # Look through all the known folds to populate these tables.
13905     foreach my $range ($cf->ranges) {
13906         my $start = $range->start;
13907         my $end = $range->end;
13908         $any_folds->add_range($start, $end);
13909
13910         my @hex_folds = split " ", $range->value;
13911         if (@hex_folds > 1) {   # Is multi-char fold
13912             $folds_to_multi_char->add_range($start, $end);
13913         }
13914
13915         my $found_locale_problematic = 0;
13916
13917         # Look at each of the folded-to characters...
13918         foreach my $i (0 .. @hex_folds - 1) {
13919             my $cp = hex $hex_folds[$i];
13920             $any_folds->add_range($cp, $cp);
13921
13922             # The fold is problematic if any of the folded-to characters is
13923             # already considered problematic.
13924             if ($loc_problem_folds->contains($cp)) {
13925                 $loc_problem_folds->add_range($start, $end);
13926                 $found_locale_problematic = 1;
13927             }
13928         }
13929
13930         # If this is a problematic fold, add to the start chars the
13931         # folding-from characters and first folded-to character.
13932         if ($found_locale_problematic) {
13933             $loc_problem_folds_start->add_range($start, $end);
13934             my $cp = hex $hex_folds[0];
13935             $loc_problem_folds_start->add_range($cp, $cp);
13936         }
13937     }
13938
13939     my $dt = property_ref('Decomposition_Type');
13940     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13941         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13942         Perl_Extension => 1,
13943         Note => 'Union of all non-canonical decompositions',
13944         );
13945
13946     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13947     # than SD appeared, construct it ourselves, based on the first release SD
13948     # was in.  A pod entry is grandfathered in for it
13949     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13950                                            Perl_Extension => 1,
13951                                            Fate => $INTERNAL_ONLY,
13952                                            Status => $DISCOURAGED);
13953     my $soft_dotted = property_ref('Soft_Dotted');
13954     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13955         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13956     }
13957     else {
13958
13959         # This list came from 3.2 Soft_Dotted; all of these code points are in
13960         # all releases
13961         $CanonDCIJ->initialize([ ord('i'),
13962                                  ord('j'),
13963                                  0x012F,
13964                                  0x0268,
13965                                  0x0456,
13966                                  0x0458,
13967                                  0x1E2D,
13968                                  0x1ECB,
13969                                ]);
13970         $CanonDCIJ = $CanonDCIJ & $Assigned;
13971     }
13972
13973     # For backward compatibility, Perl has its own definition for IDStart.
13974     # It is regular XID_Start plus the underscore, but all characters must be
13975     # Word characters as well
13976     my $XID_Start = property_ref('XID_Start');
13977     my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13978                                             Perl_Extension => 1,
13979                                             Fate => $INTERNAL_ONLY,
13980                                             Initialize => ord('_')
13981                                             );
13982     if (defined $XID_Start
13983         || defined ($XID_Start = property_ref('ID_Start')))
13984     {
13985         $perl_xids += $XID_Start->table('Y');
13986     }
13987     else {
13988         # For Unicode versions that don't have the property, construct our own
13989         # from first principles.  The actual definition is:
13990         #     Letters
13991         #   + letter numbers (Nl)
13992         #   - Pattern_Syntax
13993         #   - Pattern_White_Space
13994         #   + stability extensions
13995         #   - NKFC modifications
13996         #
13997         # What we do in the code below is to include the identical code points
13998         # that are in the first release that had Unicode's version of this
13999         # property, essentially extrapolating backwards.  There were no
14000         # stability extensions until v4.1, so none are included; likewise in
14001         # no Unicode version so far do subtracting PatSyn and PatWS make any
14002         # difference, so those also are ignored.
14003         $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14004
14005         # We do subtract the NFKC modifications that are in the first version
14006         # that had this property.  We don't bother to test if they are in the
14007         # version in question, because if they aren't, the operation is a
14008         # no-op.  The NKFC modifications are discussed in
14009         # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14010         foreach my $range ( 0x037A,
14011                             0x0E33,
14012                             0x0EB3,
14013                             [ 0xFC5E, 0xFC63 ],
14014                             [ 0xFDFA, 0xFE70 ],
14015                             [ 0xFE72, 0xFE76 ],
14016                             0xFE78,
14017                             0xFE7A,
14018                             0xFE7C,
14019                             0xFE7E,
14020                             [ 0xFF9E, 0xFF9F ],
14021         ) {
14022             if (ref $range) {
14023                 $perl_xids->delete_range($range->[0], $range->[1]);
14024             }
14025             else {
14026                 $perl_xids->delete_range($range, $range);
14027             }
14028         }
14029     }
14030
14031     $perl_xids &= $Word;
14032
14033     my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14034                                         Perl_Extension => 1,
14035                                         Fate => $INTERNAL_ONLY);
14036     my $XIDC = property_ref('XID_Continue');
14037     if (defined $XIDC
14038         || defined ($XIDC = property_ref('ID_Continue')))
14039     {
14040         $perl_xidc += $XIDC->table('Y');
14041     }
14042     else {
14043         # Similarly, we construct our own XIDC if necessary for early Unicode
14044         # versions.  The definition is:
14045         #     everything in XIDS
14046         #   + Gc=Mn
14047         #   + Gc=Mc
14048         #   + Gc=Nd
14049         #   + Gc=Pc
14050         #   - Pattern_Syntax
14051         #   - Pattern_White_Space
14052         #   + stability extensions
14053         #   - NFKC modifications
14054         #
14055         # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14056         # and stability extensions.  There is a somewhat different set of NFKC
14057         # mods to remove (and add in this case).  The ones below make this
14058         # have identical code points as in the first release that defined it.
14059         $perl_xidc += $perl_xids
14060                     + $gc->table('L')
14061                     + $gc->table('Mn')
14062                     + $gc->table('Mc')
14063                     + $gc->table('Nd')
14064                     + utf8::unicode_to_native(0xB7)
14065                     ;
14066         if (defined (my $pc = $gc->table('Pc'))) {
14067             $perl_xidc += $pc;
14068         }
14069         else {  # 1.1.5 didn't have Pc, but these should have been in it
14070             $perl_xidc += 0xFF3F;
14071             $perl_xidc->add_range(0x203F, 0x2040);
14072             $perl_xidc->add_range(0xFE33, 0xFE34);
14073             $perl_xidc->add_range(0xFE4D, 0xFE4F);
14074         }
14075
14076         # Subtract the NFKC mods
14077         foreach my $range ( 0x037A,
14078                             [ 0xFC5E, 0xFC63 ],
14079                             [ 0xFDFA, 0xFE1F ],
14080                             0xFE70,
14081                             [ 0xFE72, 0xFE76 ],
14082                             0xFE78,
14083                             0xFE7A,
14084                             0xFE7C,
14085                             0xFE7E,
14086         ) {
14087             if (ref $range) {
14088                 $perl_xidc->delete_range($range->[0], $range->[1]);
14089             }
14090             else {
14091                 $perl_xidc->delete_range($range, $range);
14092             }
14093         }
14094     }
14095
14096     $perl_xidc &= $Word;
14097
14098     my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14099                     Perl_Extension => 1,
14100                     Fate => $INTERNAL_ONLY,
14101                     Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14102                     );
14103
14104     my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14105                         Perl_Extension => 1,
14106                         Fate => $INTERNAL_ONLY,
14107                         Initialize => $perl_xidc
14108                                     + ord(" ")
14109                                     + ord("(")
14110                                     + ord(")")
14111                                     + ord("-")
14112                                     + utf8::unicode_to_native(0xA0) # NBSP
14113                         );
14114
14115     # These two tables are for matching \X, which is based on the 'extended'
14116     # grapheme cluster, which came in 5.1; create empty ones if not already
14117     # present.  The straight 'grapheme cluster' (non-extended) is used prior
14118     # to 5.1, and differs from the extended (see
14119     # http://www.unicode.org/reports/tr29/) only by these two tables, so we
14120     # get the older definition automatically when they are empty.
14121     my $gcb = property_ref('Grapheme_Cluster_Break');
14122     my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
14123                                         Perl_Extension => 1,
14124                                         Fate => $INTERNAL_ONLY);
14125     if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
14126         $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
14127     }
14128     else {
14129         push @tables_that_may_be_empty, $perl_prepend->complete_name;
14130     }
14131
14132     # All the tables with _X_ in their names are used in defining \X handling,
14133     # and are based on the Unicode GCB property.  Basically, \X matches:
14134     #   CR LF
14135     #   | Prepend* Begin Extend*
14136     #   | .
14137     # Begin is:           ( Special_Begin | ! Control )
14138     # Begin is also:      ( Regular_Begin | Special_Begin )
14139     #   where Regular_Begin is defined as ( ! Control - Special_Begin )
14140     # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
14141     # Extend is:          ( Grapheme_Extend | Spacing_Mark )
14142     # Control is:         [ GCB_Control | CR | LF ]
14143     # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
14144
14145     foreach my $gcb_name (qw{ L V T LV LVT }) {
14146
14147         # The perl internal extension's name is the gcb table name prepended
14148         # with an '_X_'
14149         my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
14150                                         Perl_Extension => 1,
14151                                         Fate => $INTERNAL_ONLY,
14152                                         Initialize => $gcb->table($gcb_name),
14153                                         );
14154         # Version 1 had mostly different Hangul syllables that were removed
14155         # from later versions, so some of the tables may not apply.
14156         if ($v_version lt v2.0) {
14157             push @tables_that_may_be_empty, $perl_table->complete_name;
14158         }
14159     }
14160
14161     # More GCB.  Populate a combined hangul syllables table
14162     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
14163                                           Perl_Extension => 1,
14164                                           Fate => $INTERNAL_ONLY);
14165     $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
14166     $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
14167
14168     my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
14169                                     Fate => $INTERNAL_ONLY);
14170     if ($v_version ge v6.2) {
14171         $ri += $gcb->table('RI');
14172     }
14173     else {
14174         push @tables_that_may_be_empty, $ri->full_name;
14175     }
14176
14177     my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
14178                                        Perl_Extension => 1,
14179                                        Fate => $INTERNAL_ONLY,
14180                                        Initialize => $lv_lvt_v
14181                                                    + $gcb->table('L')
14182                                                    + $gcb->table('T')
14183                                                    + $ri
14184                                       );
14185     $specials_begin->add_comment(join_lines( <<END
14186 For use in \\X; matches first (perhaps only) character of potential
14187 multi-character sequences that can begin an extended grapheme cluster.  They
14188 need special handling because of their complicated nature.
14189 END
14190     ));
14191     my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
14192                                        Perl_Extension => 1,
14193                                        Fate => $INTERNAL_ONLY,
14194                                        Initialize => ~ $gcb->table('Control')
14195                                                    - $specials_begin
14196                                                    - $gcb->table('CR')
14197                                                    - $gcb->table('LF')
14198                                       );
14199     $regular_begin->add_comment(join_lines( <<END
14200 For use in \\X; matches first character of anything that can begin an extended
14201 grapheme cluster, except those that require special handling.
14202 END
14203     ));
14204
14205     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
14206                                         Fate => $INTERNAL_ONLY,
14207                                         Initialize => $gcb->table('Extend')
14208                                        );
14209     if (defined (my $sm = $gcb->table('SpacingMark'))) {
14210         $extend += $sm;
14211     }
14212     $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
14213
14214     # End of GCB \X processing
14215
14216     my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
14217
14218     if (@named_sequences) {
14219         push @composition, 'Named_Sequence';
14220         foreach my $sequence (@named_sequences) {
14221             $perl_charname->add_anomalous_entry($sequence);
14222         }
14223     }
14224
14225     my $alias_sentence = "";
14226     my %abbreviations;
14227     my $alias = property_ref('Name_Alias');
14228     $perl_charname->set_proxy_for('Name_Alias');
14229
14230     # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
14231     # respect to any existing entry depends on the entry type.  Corrections go
14232     # before said entry, as they should be returned in preference over the
14233     # existing entry.  (A correction to a correction should be later in the
14234     # Name_Alias table, so it will correctly precede the erroneous correction
14235     # in Perl_Charnames.)
14236     #
14237     # Abbreviations go after everything else, so they are saved temporarily in
14238     # a hash for later.
14239     #
14240     # Everything else is added added afterwards, which preserves the input
14241     # ordering
14242
14243     foreach my $range ($alias->ranges) {
14244         next if $range->value eq "";
14245         my $code_point = $range->start;
14246         if ($code_point != $range->end) {
14247             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14248         }
14249         my ($value, $type) = split ': ', $range->value;
14250         my $replace_type;
14251         if ($type eq 'correction') {
14252             $replace_type = $MULTIPLE_BEFORE;
14253         }
14254         elsif ($type eq 'abbreviation') {
14255
14256             # Save for later
14257             $abbreviations{$value} = $code_point;
14258             next;
14259         }
14260         else {
14261             $replace_type = $MULTIPLE_AFTER;
14262         }
14263
14264         # Actually add; before or after current entry(ies) as determined
14265         # above.
14266
14267         $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14268     }
14269     $alias_sentence = <<END;
14270 The Name_Alias property adds duplicate code point entries that are
14271 alternatives to the original name.  If an addition is a corrected
14272 name, it will be physically first in the table.  The original (less correct,
14273 but still valid) name will be next; then any alternatives, in no particular
14274 order; and finally any abbreviations, again in no particular order.
14275 END
14276
14277     # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
14278     # precedence before 6.1, so should be first in the file; the other names
14279     # have precedence starting in 6.1,
14280     my $before_or_after = ($v_version lt v6.1.0)
14281                           ? $MULTIPLE_BEFORE
14282                           : $MULTIPLE_AFTER;
14283
14284     foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14285         my $code_point = $range->start;
14286         my $unicode_1_value = $range->value;
14287         next if $unicode_1_value eq "";     # Skip if name doesn't exist.
14288
14289         if ($code_point != $range->end) {
14290             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
14291         }
14292
14293         # To handle EBCDIC, we don't hard code in the code points of the
14294         # controls; instead realizing that all of them are below 256.
14295         last if $code_point > 255;
14296
14297         # We only add in the controls.
14298         next if $gc->value_of($code_point) ne 'Cc';
14299
14300         # We reject this Unicode1 name for later Perls, as it is used for
14301         # another code point
14302         next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14303
14304         # This won't add an exact duplicate.
14305         $perl_charname->add_duplicate($code_point, $unicode_1_value,
14306                                         Replace => $before_or_after);
14307     }
14308
14309     # But in this version only, the ALERT has precedence over BELL, the
14310     # Unicode_1_Name that would otherwise have precedence.
14311     if ($v_version eq v6.0.0) {
14312         $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
14313     }
14314
14315     # Now that have everything added, add in abbreviations after
14316     # everything else.  Sort so results don't change between runs of this
14317     # program
14318     foreach my $value (sort keys %abbreviations) {
14319         $perl_charname->add_duplicate($abbreviations{$value}, $value,
14320                                         Replace => $MULTIPLE_AFTER);
14321     }
14322
14323     my $comment;
14324     if (@composition <= 2) { # Always at least 2
14325         $comment = join " and ", @composition;
14326     }
14327     else {
14328         $comment = join ", ", @composition[0 .. scalar @composition - 2];
14329         $comment .= ", and $composition[-1]";
14330     }
14331
14332     $perl_charname->add_comment(join_lines( <<END
14333 This file is for charnames.pm.  It is the union of the $comment properties.
14334 Unicode_1_Name entries are used only for nameless code points in the Name
14335 property.
14336 $alias_sentence
14337 This file doesn't include the algorithmically determinable names.  For those,
14338 use 'unicore/Name.pm'
14339 END
14340     ));
14341     property_ref('Name')->add_comment(join_lines( <<END
14342 This file doesn't include the algorithmically determinable names.  For those,
14343 use 'unicore/Name.pm'
14344 END
14345     ));
14346
14347     # Construct the Present_In property from the Age property.
14348     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
14349         my $default_map = $age->default_map;
14350         my $in = Property->new('In',
14351                                 Default_Map => $default_map,
14352                                 Full_Name => "Present_In",
14353                                 Perl_Extension => 1,
14354                                 Type => $ENUM,
14355                                 Initialize => $age,
14356                                 );
14357         $in->add_comment(join_lines(<<END
14358 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
14359 same as for $age, and not for what $in really means.  This is because anything
14360 defined in a given release should have multiple values: that release and all
14361 higher ones.  But only one value per code point can be represented in a table
14362 like this.
14363 END
14364         ));
14365
14366         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
14367         # lowest numbered (earliest) come first, with the non-numeric one
14368         # last.
14369         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
14370                                             ? 1
14371                                             : ($b->name !~ /^[\d.]*$/)
14372                                                 ? -1
14373                                                 : $a->name <=> $b->name
14374                                             } $age->tables;
14375
14376         # The Present_In property is the cumulative age properties.  The first
14377         # one hence is identical to the first age one.
14378         my $previous_in = $in->add_match_table($first_age->name);
14379         $previous_in->set_equivalent_to($first_age, Related => 1);
14380
14381         my $description_start = "Code point's usage introduced in version ";
14382         $first_age->add_description($description_start . $first_age->name);
14383
14384         # To construct the accumulated values, for each of the age tables
14385         # starting with the 2nd earliest, merge the earliest with it, to get
14386         # all those code points existing in the 2nd earliest.  Repeat merging
14387         # the new 2nd earliest with the 3rd earliest to get all those existing
14388         # in the 3rd earliest, and so on.
14389         foreach my $current_age (@rest_ages) {
14390             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
14391
14392             my $current_in = $in->add_match_table(
14393                                     $current_age->name,
14394                                     Initialize => $current_age + $previous_in,
14395                                     Description => $description_start
14396                                                     . $current_age->name
14397                                                     . ' or earlier',
14398                                     );
14399             $previous_in = $current_in;
14400
14401             # Add clarifying material for the corresponding age file.  This is
14402             # in part because of the confusing and contradictory information
14403             # given in the Standard's documentation itself, as of 5.2.
14404             $current_age->add_description(
14405                             "Code point's usage was introduced in version "
14406                             . $current_age->name);
14407             $current_age->add_note("See also $in");
14408
14409         }
14410
14411         # And finally the code points whose usages have yet to be decided are
14412         # the same in both properties.  Note that permanently unassigned code
14413         # points actually have their usage assigned (as being permanently
14414         # unassigned), so that these tables are not the same as gc=cn.
14415         my $unassigned = $in->add_match_table($default_map);
14416         my $age_default = $age->table($default_map);
14417         $age_default->add_description(<<END
14418 Code point's usage has not been assigned in any Unicode release thus far.
14419 END
14420         );
14421         $unassigned->set_equivalent_to($age_default, Related => 1);
14422     }
14423
14424     # See L<perlfunc/quotemeta>
14425     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
14426                                            Perl_Extension => 1,
14427                                            Fate => $INTERNAL_ONLY,
14428
14429                                            # Initialize to what's common in
14430                                            # all Unicode releases.
14431                                            Initialize =>
14432                                                 $Space
14433                                                 + $gc->table('Control')
14434                            );
14435
14436     # In early releases without the proper Unicode properties, just set to \W.
14437     if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
14438         || ! defined (my $patws = property_ref('Pattern_White_Space'))
14439         || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
14440     {
14441         $quotemeta += ~ $Word;
14442     }
14443     else {
14444         $quotemeta += $patsyn->table('Y')
14445                    + $patws->table('Y')
14446                    + $di->table('Y')
14447                    + ((~ $Word) & $ASCII);
14448     }
14449
14450     # Finished creating all the perl properties.  All non-internal non-string
14451     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
14452     # an underscore.)  These do not get a separate entry in the pod file
14453     foreach my $table ($perl->tables) {
14454         foreach my $alias ($table->aliases) {
14455             next if $alias->name =~ /^_/;
14456             $table->add_alias('Is_' . $alias->name,
14457                                Re_Pod_Entry => 0,
14458                                UCD => 0,
14459                                Status => $alias->status,
14460                                OK_as_Filename => 0);
14461         }
14462     }
14463
14464     # Here done with all the basic stuff.  Ready to populate the information
14465     # about each character if annotating them.
14466     if ($annotate) {
14467
14468         # See comments at its declaration
14469         $annotate_ranges = Range_Map->new;
14470
14471         # This separates out the non-characters from the other unassigneds, so
14472         # can give different annotations for each.
14473         $unassigned_sans_noncharacters = Range_List->new(
14474                                     Initialize => $gc->table('Unassigned'));
14475         if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
14476             $unassigned_sans_noncharacters &= $nonchars->table('N');
14477         }
14478
14479         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
14480             $i = populate_char_info($i);    # Note sets $i so may cause skips
14481
14482         }
14483     }
14484
14485     return;
14486 }
14487
14488 sub add_perl_synonyms() {
14489     # A number of Unicode tables have Perl synonyms that are expressed in
14490     # the single-form, \p{name}.  These are:
14491     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
14492     #       \p{Is_Name} as synonyms
14493     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
14494     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
14495     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
14496     #       conflict, \p{Value} and \p{Is_Value} as well
14497     #
14498     # This routine generates these synonyms, warning of any unexpected
14499     # conflicts.
14500
14501     # Construct the list of tables to get synonyms for.  Start with all the
14502     # binary and the General_Category ones.
14503     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
14504                                                             property_ref('*');
14505     push @tables, $gc->tables;
14506
14507     # If the version of Unicode includes the Script property, add its tables
14508     push @tables, $script->tables if defined $script;
14509
14510     # The Block tables are kept separate because they are treated differently.
14511     # And the earliest versions of Unicode didn't include them, so add only if
14512     # there are some.
14513     my @blocks;
14514     push @blocks, $block->tables if defined $block;
14515
14516     # Here, have the lists of tables constructed.  Process blocks last so that
14517     # if there are name collisions with them, blocks have lowest priority.
14518     # Should there ever be other collisions, manual intervention would be
14519     # required.  See the comments at the beginning of the program for a
14520     # possible way to handle those semi-automatically.
14521     foreach my $table (@tables,  @blocks) {
14522
14523         # For non-binary properties, the synonym is just the name of the
14524         # table, like Greek, but for binary properties the synonym is the name
14525         # of the property, and means the code points in its 'Y' table.
14526         my $nominal = $table;
14527         my $nominal_property = $nominal->property;
14528         my $actual;
14529         if (! $nominal->isa('Property')) {
14530             $actual = $table;
14531         }
14532         else {
14533
14534             # Here is a binary property.  Use the 'Y' table.  Verify that is
14535             # there
14536             my $yes = $nominal->table('Y');
14537             unless (defined $yes) {  # Must be defined, but is permissible to
14538                                      # be empty.
14539                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
14540                 next;
14541             }
14542             $actual = $yes;
14543         }
14544
14545         foreach my $alias ($nominal->aliases) {
14546
14547             # Attempt to create a table in the perl directory for the
14548             # candidate table, using whatever aliases in it that don't
14549             # conflict.  Also add non-conflicting aliases for all these
14550             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
14551             PREFIX:
14552             foreach my $prefix ("", 'Is_', 'In_') {
14553
14554                 # Only Block properties can have added 'In_' aliases.
14555                 next if $prefix eq 'In_' and $nominal_property != $block;
14556
14557                 my $proposed_name = $prefix . $alias->name;
14558
14559                 # No Is_Is, In_In, nor combinations thereof
14560                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
14561                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
14562
14563                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
14564
14565                 # Get a reference to any existing table in the perl
14566                 # directory with the desired name.
14567                 my $pre_existing = $perl->table($proposed_name);
14568
14569                 if (! defined $pre_existing) {
14570
14571                     # No name collision, so ok to add the perl synonym.
14572
14573                     my $make_re_pod_entry;
14574                     my $ok_as_filename;
14575                     my $status = $alias->status;
14576                     if ($nominal_property == $block) {
14577
14578                         # For block properties, the 'In' form is preferred for
14579                         # external use; the pod file contains wild cards for
14580                         # this and the 'Is' form so no entries for those; and
14581                         # we don't want people using the name without the
14582                         # 'In', so discourage that.
14583                         if ($prefix eq "") {
14584                             $make_re_pod_entry = 1;
14585                             $status = $status || $DISCOURAGED;
14586                             $ok_as_filename = 0;
14587                         }
14588                         elsif ($prefix eq 'In_') {
14589                             $make_re_pod_entry = 0;
14590                             $status = $status || $NORMAL;
14591                             $ok_as_filename = 1;
14592                         }
14593                         else {
14594                             $make_re_pod_entry = 0;
14595                             $status = $status || $DISCOURAGED;
14596                             $ok_as_filename = 0;
14597                         }
14598                     }
14599                     elsif ($prefix ne "") {
14600
14601                         # The 'Is' prefix is handled in the pod by a wild
14602                         # card, and we won't use it for an external name
14603                         $make_re_pod_entry = 0;
14604                         $status = $status || $NORMAL;
14605                         $ok_as_filename = 0;
14606                     }
14607                     else {
14608
14609                         # Here, is an empty prefix, non block.  This gets its
14610                         # own pod entry and can be used for an external name.
14611                         $make_re_pod_entry = 1;
14612                         $status = $status || $NORMAL;
14613                         $ok_as_filename = 1;
14614                     }
14615
14616                     # Here, there isn't a perl pre-existing table with the
14617                     # name.  Look through the list of equivalents of this
14618                     # table to see if one is a perl table.
14619                     foreach my $equivalent ($actual->leader->equivalents) {
14620                         next if $equivalent->property != $perl;
14621
14622                         # Here, have found a table for $perl.  Add this alias
14623                         # to it, and are done with this prefix.
14624                         $equivalent->add_alias($proposed_name,
14625                                         Re_Pod_Entry => $make_re_pod_entry,
14626
14627                                         # Currently don't output these in the
14628                                         # ucd pod, as are strongly discouraged
14629                                         # from being used
14630                                         UCD => 0,
14631
14632                                         Status => $status,
14633                                         OK_as_Filename => $ok_as_filename);
14634                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14635                         next PREFIX;
14636                     }
14637
14638                     # Here, $perl doesn't already have a table that is a
14639                     # synonym for this property, add one.
14640                     my $added_table = $perl->add_match_table($proposed_name,
14641                                             Re_Pod_Entry => $make_re_pod_entry,
14642
14643                                             # See UCD comment just above
14644                                             UCD => 0,
14645
14646                                             Status => $status,
14647                                             OK_as_Filename => $ok_as_filename);
14648                     # And it will be related to the actual table, since it is
14649                     # based on it.
14650                     $added_table->set_equivalent_to($actual, Related => 1);
14651                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14652                     next;
14653                 } # End of no pre-existing.
14654
14655                 # Here, there is a pre-existing table that has the proposed
14656                 # name.  We could be in trouble, but not if this is just a
14657                 # synonym for another table that we have already made a child
14658                 # of the pre-existing one.
14659                 if ($pre_existing->is_set_equivalent_to($actual)) {
14660                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14661                     $pre_existing->add_alias($proposed_name);
14662                     next;
14663                 }
14664
14665                 # Here, there is a name collision, but it still could be ok if
14666                 # the tables match the identical set of code points, in which
14667                 # case, we can combine the names.  Compare each table's code
14668                 # point list to see if they are identical.
14669                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14670                 if ($pre_existing->matches_identically_to($actual)) {
14671
14672                     # Here, they do match identically.  Not a real conflict.
14673                     # Make the perl version a child of the Unicode one, except
14674                     # in the non-obvious case of where the perl name is
14675                     # already a synonym of another Unicode property.  (This is
14676                     # excluded by the test for it being its own parent.)  The
14677                     # reason for this exclusion is that then the two Unicode
14678                     # properties become related; and we don't really know if
14679                     # they are or not.  We generate documentation based on
14680                     # relatedness, and this would be misleading.  Code
14681                     # later executed in the process will cause the tables to
14682                     # be represented by a single file anyway, without making
14683                     # it look in the pod like they are necessarily related.
14684                     if ($pre_existing->parent == $pre_existing
14685                         && ($pre_existing->property == $perl
14686                             || $actual->property == $perl))
14687                     {
14688                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14689                         $pre_existing->set_equivalent_to($actual, Related => 1);
14690                     }
14691                     elsif (main::DEBUG && $to_trace) {
14692                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14693                         trace $pre_existing->parent;
14694                     }
14695                     next PREFIX;
14696                 }
14697
14698                 # Here they didn't match identically, there is a real conflict
14699                 # between our new name and a pre-existing property.
14700                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14701                 $pre_existing->add_conflicting($nominal->full_name,
14702                                                'p',
14703                                                $actual);
14704
14705                 # Don't output a warning for aliases for the block
14706                 # properties (unless they start with 'In_') as it is
14707                 # expected that there will be conflicts and the block
14708                 # form loses.
14709                 if ($verbosity >= $NORMAL_VERBOSITY
14710                     && ($actual->property != $block || $prefix eq 'In_'))
14711                 {
14712                     print simple_fold(join_lines(<<END
14713 There is already an alias named $proposed_name (from $pre_existing),
14714 so not creating this alias for $actual
14715 END
14716                     ), "", 4);
14717                 }
14718
14719                 # Keep track for documentation purposes.
14720                 $has_In_conflicts++ if $prefix eq 'In_';
14721                 $has_Is_conflicts++ if $prefix eq 'Is_';
14722             }
14723         }
14724     }
14725
14726     # There are some properties which have No and Yes (and N and Y) as
14727     # property values, but aren't binary, and could possibly be confused with
14728     # binary ones.  So create caveats for them.  There are tables that are
14729     # named 'No', and tables that are named 'N', but confusion is not likely
14730     # unless they are the same table.  For example, N meaning Number or
14731     # Neutral is not likely to cause confusion, so don't add caveats to things
14732     # like them.
14733     foreach my $property (grep { $_->type != $BINARY
14734                                  && $_->type != $FORCED_BINARY }
14735                                                             property_ref('*'))
14736     {
14737         my $yes = $property->table('Yes');
14738         if (defined $yes) {
14739             my $y = $property->table('Y');
14740             if (defined $y && $yes == $y) {
14741                 foreach my $alias ($property->aliases) {
14742                     $yes->add_conflicting($alias->name);
14743                 }
14744             }
14745         }
14746         my $no = $property->table('No');
14747         if (defined $no) {
14748             my $n = $property->table('N');
14749             if (defined $n && $no == $n) {
14750                 foreach my $alias ($property->aliases) {
14751                     $no->add_conflicting($alias->name, 'P');
14752                 }
14753             }
14754         }
14755     }
14756
14757     return;
14758 }
14759
14760 sub register_file_for_name($$$) {
14761     # Given info about a table and a datafile that it should be associated
14762     # with, register that association
14763
14764     my $table = shift;
14765     my $directory_ref = shift;   # Array of the directory path for the file
14766     my $file = shift;            # The file name in the final directory.
14767     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14768
14769     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14770
14771     if ($table->isa('Property')) {
14772         $table->set_file_path(@$directory_ref, $file);
14773         push @map_properties, $table;
14774
14775         # No swash means don't do the rest of this.
14776         return if $table->fate != $ORDINARY;
14777
14778         # Get the path to the file
14779         my @path = $table->file_path;
14780
14781         # Use just the file name if no subdirectory.
14782         shift @path if $path[0] eq File::Spec->curdir();
14783
14784         my $file = join '/', @path;
14785
14786         # Create a hash entry for utf8_heavy to get the file that stores this
14787         # property's map table
14788         foreach my $alias ($table->aliases) {
14789             my $name = $alias->name;
14790             $loose_property_to_file_of{standardize($name)} = $file;
14791         }
14792
14793         # And a way for utf8_heavy to find the proper key in the SwashInfo
14794         # hash for this property.
14795         $file_to_swash_name{$file} = "To" . $table->swash_name;
14796         return;
14797     }
14798
14799     # Do all of the work for all equivalent tables when called with the leader
14800     # table, so skip if isn't the leader.
14801     return if $table->leader != $table;
14802
14803     # If this is a complement of another file, use that other file instead,
14804     # with a ! prepended to it.
14805     my $complement;
14806     if (($complement = $table->complement) != 0) {
14807         my @directories = $complement->file_path;
14808
14809         # This assumes that the 0th element is something like 'lib',
14810         # the 1th element the property name (in its own directory), like
14811         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14812         # appended to it later.
14813         $directories[1] =~ s/^/!/;
14814         $file = pop @directories;
14815         $directory_ref =\@directories;
14816     }
14817
14818     # Join all the file path components together, using slashes.
14819     my $full_filename = join('/', @$directory_ref, $file);
14820
14821     # All go in the same subdirectory of unicore
14822     if ($directory_ref->[0] ne $matches_directory) {
14823         Carp::my_carp("Unexpected directory in "
14824                 .  join('/', @{$directory_ref}, $file));
14825     }
14826
14827     # For this table and all its equivalents ...
14828     foreach my $table ($table, $table->equivalents) {
14829
14830         # Associate it with its file internally.  Don't include the
14831         # $matches_directory first component
14832         $table->set_file_path(@$directory_ref, $file);
14833
14834         # No swash means don't do the rest of this.
14835         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14836
14837         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14838
14839         my $property = $table->property;
14840         my $property_name = ($property == $perl)
14841                              ? ""  # 'perl' is never explicitly stated
14842                              : standardize($property->name) . '=';
14843
14844         my $is_default = 0; # Is this table the default one for the property?
14845
14846         # To calculate $is_default, we find if this table is the same as the
14847         # default one for the property.  But this is complicated by the
14848         # possibility that there is a master table for this one, and the
14849         # information is stored there instead of here.
14850         my $parent = $table->parent;
14851         my $leader_prop = $parent->property;
14852         my $default_map = $leader_prop->default_map;
14853         if (defined $default_map) {
14854             my $default_table = $leader_prop->table($default_map);
14855             $is_default = 1 if defined $default_table && $parent == $default_table;
14856         }
14857
14858         # Calculate the loose name for this table.  Mostly it's just its name,
14859         # standardized.  But in the case of Perl tables that are single-form
14860         # equivalents to Unicode properties, it is the latter's name.
14861         my $loose_table_name =
14862                         ($property != $perl || $leader_prop == $perl)
14863                         ? standardize($table->name)
14864                         : standardize($parent->name);
14865
14866         my $deprecated = ($table->status eq $DEPRECATED)
14867                          ? $table->status_info
14868                          : "";
14869         my $caseless_equivalent = $table->caseless_equivalent;
14870
14871         # And for each of the table's aliases...  This inner loop eventually
14872         # goes through all aliases in the UCD that we generate regex match
14873         # files for
14874         foreach my $alias ($table->aliases) {
14875             my $standard = utf8_heavy_name($table, $alias);
14876
14877             # Generate an entry in either the loose or strict hashes, which
14878             # will translate the property and alias names combination into the
14879             # file where the table for them is stored.
14880             if ($alias->loose_match) {
14881                 if (exists $loose_to_file_of{$standard}) {
14882                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14883                 }
14884                 else {
14885                     $loose_to_file_of{$standard} = $sub_filename;
14886                 }
14887             }
14888             else {
14889                 if (exists $stricter_to_file_of{$standard}) {
14890                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14891                 }
14892                 else {
14893                     $stricter_to_file_of{$standard} = $sub_filename;
14894
14895                     # Tightly coupled with how utf8_heavy.pl works, for a
14896                     # floating point number that is a whole number, get rid of
14897                     # the trailing decimal point and 0's, so that utf8_heavy
14898                     # will work.  Also note that this assumes that such a
14899                     # number is matched strictly; so if that were to change,
14900                     # this would be wrong.
14901                     if ((my $integer_name = $alias->name)
14902                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14903                     {
14904                         $stricter_to_file_of{$property_name . $integer_name}
14905                                                             = $sub_filename;
14906                     }
14907                 }
14908             }
14909
14910             # For Unicode::UCD, create a mapping of the prop=value to the
14911             # canonical =value for that property.
14912             if ($standard =~ /=/) {
14913
14914                 # This could happen if a strict name mapped into an existing
14915                 # loose name.  In that event, the strict names would have to
14916                 # be moved to a new hash.
14917                 if (exists($loose_to_standard_value{$standard})) {
14918                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
14919                 }
14920                 $loose_to_standard_value{$standard} = $loose_table_name;
14921             }
14922
14923             # Keep a list of the deprecated properties and their filenames
14924             if ($deprecated && $complement == 0) {
14925                 $utf8::why_deprecated{$sub_filename} = $deprecated;
14926             }
14927
14928             # And a substitute table, if any, for case-insensitive matching
14929             if ($caseless_equivalent != 0) {
14930                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14931             }
14932
14933             # Add to defaults list if the table this alias belongs to is the
14934             # default one
14935             $loose_defaults{$standard} = 1 if $is_default;
14936         }
14937     }
14938
14939     return;
14940 }
14941
14942 {   # Closure
14943     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
14944                      # conflicts
14945     my %full_dir_name_of;   # Full length names of directories used.
14946
14947     sub construct_filename($$$) {
14948         # Return a file name for a table, based on the table name, but perhaps
14949         # changed to get rid of non-portable characters in it, and to make
14950         # sure that it is unique on a file system that allows the names before
14951         # any period to be at most 8 characters (DOS).  While we're at it
14952         # check and complain if there are any directory conflicts.
14953
14954         my $name = shift;       # The name to start with
14955         my $mutable = shift;    # Boolean: can it be changed?  If no, but
14956                                 # yet it must be to work properly, a warning
14957                                 # is given
14958         my $directories_ref = shift;  # A reference to an array containing the
14959                                 # path to the file, with each element one path
14960                                 # component.  This is used because the same
14961                                 # name can be used in different directories.
14962         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14963
14964         my $warn = ! defined wantarray;  # If true, then if the name is
14965                                 # changed, a warning is issued as well.
14966
14967         if (! defined $name) {
14968             Carp::my_carp("Undefined name in directory "
14969                           . File::Spec->join(@$directories_ref)
14970                           . ". '_' used");
14971             return '_';
14972         }
14973
14974         # Make sure that no directory names conflict with each other.  Look at
14975         # each directory in the input file's path.  If it is already in use,
14976         # assume it is correct, and is merely being re-used, but if we
14977         # truncate it to 8 characters, and find that there are two directories
14978         # that are the same for the first 8 characters, but differ after that,
14979         # then that is a problem.
14980         foreach my $directory (@$directories_ref) {
14981             my $short_dir = substr($directory, 0, 8);
14982             if (defined $full_dir_name_of{$short_dir}) {
14983                 next if $full_dir_name_of{$short_dir} eq $directory;
14984                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
14985             }
14986             else {
14987                 $full_dir_name_of{$short_dir} = $directory;
14988             }
14989         }
14990
14991         my $path = join '/', @$directories_ref;
14992         $path .= '/' if $path;
14993
14994         # Remove interior underscores.
14995         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14996
14997         # Change any non-word character into an underscore, and truncate to 8.
14998         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
14999         substr($filename, 8) = "" if length($filename) > 8;
15000
15001         # Make sure the basename doesn't conflict with something we
15002         # might have already written. If we have, say,
15003         #     InGreekExtended1
15004         #     InGreekExtended2
15005         # they become
15006         #     InGreekE
15007         #     InGreek2
15008         my $warned = 0;
15009         while (my $num = $base_names{$path}{lc $filename}++) {
15010             $num++; # so basenames with numbers start with '2', which
15011                     # just looks more natural.
15012
15013             # Want to append $num, but if it'll make the basename longer
15014             # than 8 characters, pre-truncate $filename so that the result
15015             # is acceptable.
15016             my $delta = length($filename) + length($num) - 8;
15017             if ($delta > 0) {
15018                 substr($filename, -$delta) = $num;
15019             }
15020             else {
15021                 $filename .= $num;
15022             }
15023             if ($warn && ! $warned) {
15024                 $warned = 1;
15025                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
15026             }
15027         }
15028
15029         return $filename if $mutable;
15030
15031         # If not changeable, must return the input name, but warn if needed to
15032         # change it beyond shortening it.
15033         if ($name ne $filename
15034             && substr($name, 0, length($filename)) ne $filename) {
15035             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
15036         }
15037         return $name;
15038     }
15039 }
15040
15041 # The pod file contains a very large table.  Many of the lines in that table
15042 # would exceed a typical output window's size, and so need to be wrapped with
15043 # a hanging indent to make them look good.  The pod language is really
15044 # insufficient here.  There is no general construct to do that in pod, so it
15045 # is done here by beginning each such line with a space to cause the result to
15046 # be output without formatting, and doing all the formatting here.  This leads
15047 # to the result that if the eventual display window is too narrow it won't
15048 # look good, and if the window is too wide, no advantage is taken of that
15049 # extra width.  A further complication is that the output may be indented by
15050 # the formatter so that there is less space than expected.  What I (khw) have
15051 # done is to assume that that indent is a particular number of spaces based on
15052 # what it is in my Linux system;  people can always resize their windows if
15053 # necessary, but this is obviously less than desirable, but the best that can
15054 # be expected.
15055 my $automatic_pod_indent = 8;
15056
15057 # Try to format so that uses fewest lines, but few long left column entries
15058 # slide into the right column.  An experiment on 5.1 data yielded the
15059 # following percentages that didn't cut into the other side along with the
15060 # associated first-column widths
15061 # 69% = 24
15062 # 80% not too bad except for a few blocks
15063 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
15064 # 95% = 37;
15065 my $indent_info_column = 27;    # 75% of lines didn't have overlap
15066
15067 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
15068                     # The 3 is because of:
15069                     #   1   for the leading space to tell the pod formatter to
15070                     #       output as-is
15071                     #   1   for the flag
15072                     #   1   for the space between the flag and the main data
15073
15074 sub format_pod_line ($$$;$$) {
15075     # Take a pod line and return it, formatted properly
15076
15077     my $first_column_width = shift;
15078     my $entry = shift;  # Contents of left column
15079     my $info = shift;   # Contents of right column
15080
15081     my $status = shift || "";   # Any flag
15082
15083     my $loose_match = shift;    # Boolean.
15084     $loose_match = 1 unless defined $loose_match;
15085
15086     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15087
15088     my $flags = "";
15089     $flags .= $STRICTER if ! $loose_match;
15090
15091     $flags .= $status if $status;
15092
15093     # There is a blank in the left column to cause the pod formatter to
15094     # output the line as-is.
15095     return sprintf " %-*s%-*s %s\n",
15096                     # The first * in the format is replaced by this, the -1 is
15097                     # to account for the leading blank.  There isn't a
15098                     # hard-coded blank after this to separate the flags from
15099                     # the rest of the line, so that in the unlikely event that
15100                     # multiple flags are shown on the same line, they both
15101                     # will get displayed at the expense of that separation,
15102                     # but since they are left justified, a blank will be
15103                     # inserted in the normal case.
15104                     $FILLER - 1,
15105                     $flags,
15106
15107                     # The other * in the format is replaced by this number to
15108                     # cause the first main column to right fill with blanks.
15109                     # The -1 is for the guaranteed blank following it.
15110                     $first_column_width - $FILLER - 1,
15111                     $entry,
15112                     $info;
15113 }
15114
15115 my @zero_match_tables;  # List of tables that have no matches in this release
15116
15117 sub make_re_pod_entries($) {
15118     # This generates the entries for the pod file for a given table.
15119     # Also done at this time are any children tables.  The output looks like:
15120     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
15121
15122     my $input_table = shift;        # Table the entry is for
15123     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15124
15125     # Generate parent and all its children at the same time.
15126     return if $input_table->parent != $input_table;
15127
15128     my $property = $input_table->property;
15129     my $type = $property->type;
15130     my $full_name = $property->full_name;
15131
15132     my $count = $input_table->count;
15133     my $unicode_count;
15134     my $non_unicode_string;
15135     if ($count > $MAX_UNICODE_CODEPOINTS) {
15136         $unicode_count = $count - ($MAX_WORKING_CODEPOINT
15137                                     - $MAX_UNICODE_CODEPOINT);
15138         $non_unicode_string = " plus all above-Unicode code points";
15139     }
15140     else {
15141         $unicode_count = $count;
15142         $non_unicode_string = "";
15143     }
15144     my $string_count = clarify_number($unicode_count) . $non_unicode_string;
15145     my $status = $input_table->status;
15146     my $status_info = $input_table->status_info;
15147     my $caseless_equivalent = $input_table->caseless_equivalent;
15148
15149     # Don't mention a placeholder equivalent as it isn't to be listed in the
15150     # pod
15151     $caseless_equivalent = 0 if $caseless_equivalent != 0
15152                                 && $caseless_equivalent->fate > $ORDINARY;
15153
15154     my $entry_for_first_table; # The entry for the first table output.
15155                            # Almost certainly, it is the parent.
15156
15157     # For each related table (including itself), we will generate a pod entry
15158     # for each name each table goes by
15159     foreach my $table ($input_table, $input_table->children) {
15160
15161         # utf8_heavy.pl cannot deal with null string property values, so skip
15162         # any tables that have no non-null names.
15163         next if ! grep { $_->name ne "" } $table->aliases;
15164
15165         # First, gather all the info that applies to this table as a whole.
15166
15167         push @zero_match_tables, $table if $count == 0
15168                                             # Don't mention special tables
15169                                             # as being zero length
15170                                            && $table->fate == $ORDINARY;
15171
15172         my $table_property = $table->property;
15173
15174         # The short name has all the underscores removed, while the full name
15175         # retains them.  Later, we decide whether to output a short synonym
15176         # for the full one, we need to compare apples to apples, so we use the
15177         # short name's length including underscores.
15178         my $table_property_short_name_length;
15179         my $table_property_short_name
15180             = $table_property->short_name(\$table_property_short_name_length);
15181         my $table_property_full_name = $table_property->full_name;
15182
15183         # Get how much savings there is in the short name over the full one
15184         # (delta will always be <= 0)
15185         my $table_property_short_delta = $table_property_short_name_length
15186                                          - length($table_property_full_name);
15187         my @table_description = $table->description;
15188         my @table_note = $table->note;
15189
15190         # Generate an entry for each alias in this table.
15191         my $entry_for_first_alias;  # saves the first one encountered.
15192         foreach my $alias ($table->aliases) {
15193
15194             # Skip if not to go in pod.
15195             next unless $alias->make_re_pod_entry;
15196
15197             # Start gathering all the components for the entry
15198             my $name = $alias->name;
15199
15200             # Skip if name is empty, as can't be accessed by regexes.
15201             next if $name eq "";
15202
15203             my $entry;      # Holds the left column, may include extras
15204             my $entry_ref;  # To refer to the left column's contents from
15205                             # another entry; has no extras
15206
15207             # First the left column of the pod entry.  Tables for the $perl
15208             # property always use the single form.
15209             if ($table_property == $perl) {
15210                 $entry = "\\p{$name}";
15211                 $entry_ref = "\\p{$name}";
15212             }
15213             else {    # Compound form.
15214
15215                 # Only generate one entry for all the aliases that mean true
15216                 # or false in binary properties.  Append a '*' to indicate
15217                 # some are missing.  (The heading comment notes this.)
15218                 my $rhs;
15219                 if ($type == $BINARY) {
15220                     next if $name ne 'N' && $name ne 'Y';
15221                     $rhs = "$name*";
15222                 }
15223                 elsif ($type != $FORCED_BINARY) {
15224                     $rhs = $name;
15225                 }
15226                 else {
15227
15228                     # Forced binary properties require special handling.  It
15229                     # has two sets of tables, one set is true/false; and the
15230                     # other set is everything else.  Entries are generated for
15231                     # each set.  Use the Bidi_Mirrored property (which appears
15232                     # in all Unicode versions) to get a list of the aliases
15233                     # for the true/false tables.  Of these, only output the N
15234                     # and Y ones, the same as, a regular binary property.  And
15235                     # output all the rest, same as a non-binary property.
15236                     my $bm = property_ref("Bidi_Mirrored");
15237                     if ($name eq 'N' || $name eq 'Y') {
15238                         $rhs = "$name*";
15239                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
15240                                                         $bm->table("N")->aliases)
15241                     {
15242                         next;
15243                     }
15244                     else {
15245                         $rhs = $name;
15246                     }
15247                 }
15248
15249                 # Colon-space is used to give a little more space to be easier
15250                 # to read;
15251                 $entry = "\\p{"
15252                         . $table_property_full_name
15253                         . ": $rhs}";
15254
15255                 # But for the reference to this entry, which will go in the
15256                 # right column, where space is at a premium, use equals
15257                 # without a space
15258                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
15259             }
15260
15261             # Then the right (info) column.  This is stored as components of
15262             # an array for the moment, then joined into a string later.  For
15263             # non-internal only properties, begin the info with the entry for
15264             # the first table we encountered (if any), as things are ordered
15265             # so that that one is the most descriptive.  This leads to the
15266             # info column of an entry being a more descriptive version of the
15267             # name column
15268             my @info;
15269             if ($name =~ /^_/) {
15270                 push @info,
15271                         '(For internal use by Perl, not necessarily stable)';
15272             }
15273             elsif ($entry_for_first_alias) {
15274                 push @info, $entry_for_first_alias;
15275             }
15276
15277             # If this entry is equivalent to another, add that to the info,
15278             # using the first such table we encountered
15279             if ($entry_for_first_table) {
15280                 if (@info) {
15281                     push @info, "(= $entry_for_first_table)";
15282                 }
15283                 else {
15284                     push @info, $entry_for_first_table;
15285                 }
15286             }
15287
15288             # If the name is a large integer, add an equivalent with an
15289             # exponent for better readability
15290             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
15291                 push @info, sprintf "(= %.1e)", $name
15292             }
15293
15294             my $parenthesized = "";
15295             if (! $entry_for_first_alias) {
15296
15297                 # This is the first alias for the current table.  The alias
15298                 # array is ordered so that this is the fullest, most
15299                 # descriptive alias, so it gets the fullest info.  The other
15300                 # aliases are mostly merely pointers to this one, using the
15301                 # information already added above.
15302
15303                 # Display any status message, but only on the parent table
15304                 if ($status && ! $entry_for_first_table) {
15305                     push @info, $status_info;
15306                 }
15307
15308                 # Put out any descriptive info
15309                 if (@table_description || @table_note) {
15310                     push @info, join "; ", @table_description, @table_note;
15311                 }
15312
15313                 # Look to see if there is a shorter name we can point people
15314                 # at
15315                 my $standard_name = standardize($name);
15316                 my $short_name;
15317                 my $proposed_short = $table->short_name;
15318                 if (defined $proposed_short) {
15319                     my $standard_short = standardize($proposed_short);
15320
15321                     # If the short name is shorter than the standard one, or
15322                     # even it it's not, but the combination of it and its
15323                     # short property name (as in \p{prop=short} ($perl doesn't
15324                     # have this form)) saves at least two characters, then,
15325                     # cause it to be listed as a shorter synonym.
15326                     if (length $standard_short < length $standard_name
15327                         || ($table_property != $perl
15328                             && (length($standard_short)
15329                                 - length($standard_name)
15330                                 + $table_property_short_delta)  # (<= 0)
15331                                 < -2))
15332                     {
15333                         $short_name = $proposed_short;
15334                         if ($table_property != $perl) {
15335                             $short_name = $table_property_short_name
15336                                           . "=$short_name";
15337                         }
15338                         $short_name = "\\p{$short_name}";
15339                     }
15340                 }
15341
15342                 # And if this is a compound form name, see if there is a
15343                 # single form equivalent
15344                 my $single_form;
15345                 if ($table_property != $perl) {
15346
15347                     # Special case the binary N tables, so that will print
15348                     # \P{single}, but use the Y table values to populate
15349                     # 'single', as we haven't likewise populated the N table.
15350                     # For forced binary tables, we can't just look at the N
15351                     # table, but must see if this table is equivalent to the N
15352                     # one, as there are two equivalent beasts in these
15353                     # properties.
15354                     my $test_table;
15355                     my $p;
15356                     if (   ($type == $BINARY
15357                             && $input_table == $property->table('No'))
15358                         || ($type == $FORCED_BINARY
15359                             && $property->table('No')->
15360                                         is_set_equivalent_to($input_table)))
15361                     {
15362                         $test_table = $property->table('Yes');
15363                         $p = 'P';
15364                     }
15365                     else {
15366                         $test_table = $input_table;
15367                         $p = 'p';
15368                     }
15369
15370                     # Look for a single form amongst all the children.
15371                     foreach my $table ($test_table->children) {
15372                         next if $table->property != $perl;
15373                         my $proposed_name = $table->short_name;
15374                         next if ! defined $proposed_name;
15375
15376                         # Don't mention internal-only properties as a possible
15377                         # single form synonym
15378                         next if substr($proposed_name, 0, 1) eq '_';
15379
15380                         $proposed_name = "\\$p\{$proposed_name}";
15381                         if (! defined $single_form
15382                             || length($proposed_name) < length $single_form)
15383                         {
15384                             $single_form = $proposed_name;
15385
15386                             # The goal here is to find a single form; not the
15387                             # shortest possible one.  We've already found a
15388                             # short name.  So, stop at the first single form
15389                             # found, which is likely to be closer to the
15390                             # original.
15391                             last;
15392                         }
15393                     }
15394                 }
15395
15396                 # Ouput both short and single in the same parenthesized
15397                 # expression, but with only one of 'Single', 'Short' if there
15398                 # are both items.
15399                 if ($short_name || $single_form || $table->conflicting) {
15400                     $parenthesized .= "Short: $short_name" if $short_name;
15401                     if ($short_name && $single_form) {
15402                         $parenthesized .= ', ';
15403                     }
15404                     elsif ($single_form) {
15405                         $parenthesized .= 'Single: ';
15406                     }
15407                     $parenthesized .= $single_form if $single_form;
15408                 }
15409             }
15410
15411             if ($caseless_equivalent != 0) {
15412                 $parenthesized .=  '; ' if $parenthesized ne "";
15413                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
15414             }
15415
15416
15417             # Warn if this property isn't the same as one that a
15418             # semi-casual user might expect.  The other components of this
15419             # parenthesized structure are calculated only for the first entry
15420             # for this table, but the conflicting is deemed important enough
15421             # to go on every entry.
15422             my $conflicting = join " NOR ", $table->conflicting;
15423             if ($conflicting) {
15424                 $parenthesized .=  '; ' if $parenthesized ne "";
15425                 $parenthesized .= "NOT $conflicting";
15426             }
15427
15428             push @info, "($parenthesized)" if $parenthesized;
15429
15430             if ($name =~ /_$/ && $alias->loose_match) {
15431                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
15432             }
15433
15434             if ($table_property != $perl && $table->perl_extension) {
15435                 push @info, '(Perl extension)';
15436             }
15437             push @info, "($string_count)";
15438
15439             # Now, we have both the entry and info so add them to the
15440             # list of all the properties.
15441             push @match_properties,
15442                 format_pod_line($indent_info_column,
15443                                 $entry,
15444                                 join( " ", @info),
15445                                 $alias->status,
15446                                 $alias->loose_match);
15447
15448             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
15449         } # End of looping through the aliases for this table.
15450
15451         if (! $entry_for_first_table) {
15452             $entry_for_first_table = $entry_for_first_alias;
15453         }
15454     } # End of looping through all the related tables
15455     return;
15456 }
15457
15458 sub make_ucd_table_pod_entries {
15459     my $table = shift;
15460
15461     # Generate the entries for the UCD section of the pod for $table.  This
15462     # also calculates if names are ambiguous, so has to be called even if the
15463     # pod is not being output
15464
15465     my $short_name = $table->name;
15466     my $standard_short_name = standardize($short_name);
15467     my $full_name = $table->full_name;
15468     my $standard_full_name = standardize($full_name);
15469
15470     my $full_info = "";     # Text of info column for full-name entries
15471     my $other_info = "";    # Text of info column for short-name entries
15472     my $short_info = "";    # Text of info column for other entries
15473     my $meaning = "";       # Synonym of this table
15474
15475     my $property = ($table->isa('Property'))
15476                    ? $table
15477                    : $table->parent->property;
15478
15479     my $perl_extension = $table->perl_extension;
15480
15481     # Get the more official name for for perl extensions that aren't
15482     # stand-alone properties
15483     if ($perl_extension && $property != $table) {
15484         if ($property == $perl ||$property->type == $BINARY) {
15485             $meaning = $table->complete_name;
15486         }
15487         else {
15488             $meaning = $property->full_name . "=$full_name";
15489         }
15490     }
15491
15492     # There are three types of info column.  One for the short name, one for
15493     # the full name, and one for everything else.  They mostly are the same,
15494     # so initialize in the same loop.
15495     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
15496         if ($perl_extension && $property != $table) {
15497
15498             # Add the synonymous name for the non-full name entries; and to
15499             # the full-name entry if it adds extra information
15500             if ($info_ref == \$other_info
15501                 || ($info_ref == \$short_info
15502                     && $standard_short_name ne $standard_full_name)
15503                 || standardize($meaning) ne $standard_full_name
15504             ) {
15505                 $$info_ref .= "$meaning.";
15506             }
15507         }
15508         elsif ($info_ref != \$full_info) {
15509
15510             # Otherwise, the non-full name columns include the full name
15511             $$info_ref .= $full_name;
15512         }
15513
15514         # And the full-name entry includes the short name, if different
15515         if ($info_ref == \$full_info
15516             && $standard_short_name ne $standard_full_name)
15517         {
15518             $full_info =~ s/\.\Z//;
15519             $full_info .= "  " if $full_info;
15520             $full_info .= "(Short: $short_name)";
15521         }
15522
15523         if ($table->perl_extension) {
15524             $$info_ref =~ s/\.\Z//;
15525             $$info_ref .= ".  " if $$info_ref;
15526             $$info_ref .= "(Perl extension)";
15527         }
15528     }
15529
15530     # Add any extra annotations to the full name entry
15531     foreach my $more_info ($table->description,
15532                             $table->note,
15533                             $table->status_info)
15534     {
15535         next unless $more_info;
15536         $full_info =~ s/\.\Z//;
15537         $full_info .= ".  " if $full_info;
15538         $full_info .= $more_info;
15539     }
15540
15541     # These keep track if have created full and short name pod entries for the
15542     # property
15543     my $done_full = 0;
15544     my $done_short = 0;
15545
15546     # Every possible name is kept track of, even those that aren't going to be
15547     # output.  This way we can be sure to find the ambiguities.
15548     foreach my $alias ($table->aliases) {
15549         my $name = $alias->name;
15550         my $standard = standardize($name);
15551         my $info;
15552         my $output_this = $alias->ucd;
15553
15554         # If the full and short names are the same, we want to output the full
15555         # one's entry, so it has priority.
15556         if ($standard eq $standard_full_name) {
15557             next if $done_full;
15558             $done_full = 1;
15559             $info = $full_info;
15560         }
15561         elsif ($standard eq $standard_short_name) {
15562             next if $done_short;
15563             $done_short = 1;
15564             next if $standard_short_name eq $standard_full_name;
15565             $info = $short_info;
15566         }
15567         else {
15568             $info = $other_info;
15569         }
15570
15571         # Here, we have set up the two columns for this entry.  But if an
15572         # entry already exists for this name, we have to decide which one
15573         # we're going to later output.
15574         if (exists $ucd_pod{$standard}) {
15575
15576             # If the two entries refer to the same property, it's not going to
15577             # be ambiguous.  (Likely it's because the names when standardized
15578             # are the same.)  But that means if they are different properties,
15579             # there is ambiguity.
15580             if ($ucd_pod{$standard}->{'property'} != $property) {
15581
15582                 # Here, we have an ambiguity.  This code assumes that one is
15583                 # scheduled to be output and one not and that one is a perl
15584                 # extension (which is not to be output) and the other isn't.
15585                 # If those assumptions are wrong, things have to be rethought.
15586                 if ($ucd_pod{$standard}{'output_this'} == $output_this
15587                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
15588                     || $output_this == $perl_extension)
15589                 {
15590                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
15591                 }
15592
15593                 # We modifiy the info column of the one being output to
15594                 # indicate the ambiguity.  Set $which to point to that one's
15595                 # info.
15596                 my $which;
15597                 if ($ucd_pod{$standard}{'output_this'}) {
15598                     $which = \$ucd_pod{$standard}->{'info'};
15599                 }
15600                 else {
15601                     $which = \$info;
15602                     $meaning = $ucd_pod{$standard}{'meaning'};
15603                 }
15604
15605                 chomp $$which;
15606                 $$which =~ s/\.\Z//;
15607                 $$which .= "; NOT '$standard' meaning '$meaning'";
15608
15609                 $ambiguous_names{$standard} = 1;
15610             }
15611
15612             # Use the non-perl-extension variant
15613             next unless $ucd_pod{$standard}{'perl_extension'};
15614         }
15615
15616         # Store enough information about this entry that we can later look for
15617         # ambiguities, and output it properly.
15618         $ucd_pod{$standard} = { 'name' => $name,
15619                                 'info' => $info,
15620                                 'meaning' => $meaning,
15621                                 'output_this' => $output_this,
15622                                 'perl_extension' => $perl_extension,
15623                                 'property' => $property,
15624                                 'status' => $alias->status,
15625         };
15626     } # End of looping through all this table's aliases
15627
15628     return;
15629 }
15630
15631 sub pod_alphanumeric_sort {
15632     # Sort pod entries alphanumerically.
15633
15634     # The first few character columns are filler, plus the '\p{'; and get rid
15635     # of all the trailing stuff, starting with the trailing '}', so as to sort
15636     # on just 'Name=Value'
15637     (my $a = lc $a) =~ s/^ .*? { //x;
15638     $a =~ s/}.*//;
15639     (my $b = lc $b) =~ s/^ .*? { //x;
15640     $b =~ s/}.*//;
15641
15642     # Determine if the two operands are both internal only or both not.
15643     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
15644     # should be the underscore that begins internal only
15645     my $a_is_internal = (substr($a, 0, 1) eq '_');
15646     my $b_is_internal = (substr($b, 0, 1) eq '_');
15647
15648     # Sort so the internals come last in the table instead of first (which the
15649     # leading underscore would otherwise indicate).
15650     if ($a_is_internal != $b_is_internal) {
15651         return 1 if $a_is_internal;
15652         return -1
15653     }
15654
15655     # Determine if the two operands are numeric property values or not.
15656     # A numeric property will look like xyz: 3.  But the number
15657     # can begin with an optional minus sign, and may have a
15658     # fraction or rational component, like xyz: 3/2.  If either
15659     # isn't numeric, use alphabetic sort.
15660     my ($a_initial, $a_number) =
15661         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15662     return $a cmp $b unless defined $a_number;
15663     my ($b_initial, $b_number) =
15664         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15665     return $a cmp $b unless defined $b_number;
15666
15667     # Here they are both numeric, but use alphabetic sort if the
15668     # initial parts don't match
15669     return $a cmp $b if $a_initial ne $b_initial;
15670
15671     # Convert rationals to floating for the comparison.
15672     $a_number = eval $a_number if $a_number =~ qr{/};
15673     $b_number = eval $b_number if $b_number =~ qr{/};
15674
15675     return $a_number <=> $b_number;
15676 }
15677
15678 sub make_pod () {
15679     # Create the .pod file.  This generates the various subsections and then
15680     # combines them in one big HERE document.
15681
15682     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15683
15684     return unless defined $pod_directory;
15685     print "Making pod file\n" if $verbosity >= $PROGRESS;
15686
15687     my $exception_message =
15688     '(Any exceptions are individually noted beginning with the word NOT.)';
15689     my @block_warning;
15690     if (-e 'Blocks.txt') {
15691
15692         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
15693         # if the global $has_In_conflicts indicates we have them.
15694         push @match_properties, format_pod_line($indent_info_column,
15695                                                 '\p{In_*}',
15696                                                 '\p{Block: *}'
15697                                                     . (($has_In_conflicts)
15698                                                       ? " $exception_message"
15699                                                       : ""));
15700         @block_warning = << "END";
15701
15702 Matches in the Block property have shortcuts that begin with "In_".  For
15703 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
15704 backward compatibility, if there is no conflict with another shortcut, these
15705 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
15706 are numerous such conflicting shortcuts.  Use of these forms for Block is
15707 discouraged, and are flagged as such, not only because of the potential
15708 confusion as to what is meant, but also because a later release of Unicode may
15709 preempt the shortcut, and your program would no longer be correct.  Use the
15710 "In_" form instead to avoid this, or even more clearly, use the compound form,
15711 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
15712 about this.
15713 END
15714     }
15715     my $text = $Is_flags_text;
15716     $text = "$exception_message $text" if $has_Is_conflicts;
15717
15718     # And the 'Is_ line';
15719     push @match_properties, format_pod_line($indent_info_column,
15720                                             '\p{Is_*}',
15721                                             "\\p{*} $text");
15722
15723     # Sort the properties array for output.  It is sorted alphabetically
15724     # except numerically for numeric properties, and only output unique lines.
15725     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15726
15727     my $formatted_properties = simple_fold(\@match_properties,
15728                                         "",
15729                                         # indent succeeding lines by two extra
15730                                         # which looks better
15731                                         $indent_info_column + 2,
15732
15733                                         # shorten the line length by how much
15734                                         # the formatter indents, so the folded
15735                                         # line will fit in the space
15736                                         # presumably available
15737                                         $automatic_pod_indent);
15738     # Add column headings, indented to be a little more centered, but not
15739     # exactly
15740     $formatted_properties =  format_pod_line($indent_info_column,
15741                                                     '    NAME',
15742                                                     '           INFO')
15743                                     . "\n"
15744                                     . $formatted_properties;
15745
15746     # Generate pod documentation lines for the tables that match nothing
15747     my $zero_matches = "";
15748     if (@zero_match_tables) {
15749         @zero_match_tables = uniques(@zero_match_tables);
15750         $zero_matches = join "\n\n",
15751                         map { $_ = '=item \p{' . $_->complete_name . "}" }
15752                             sort { $a->complete_name cmp $b->complete_name }
15753                             @zero_match_tables;
15754
15755         $zero_matches = <<END;
15756
15757 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15758
15759 Unicode has some property-value pairs that currently don't match anything.
15760 This happens generally either because they are obsolete, or they exist for
15761 symmetry with other forms, but no language has yet been encoded that uses
15762 them.  In this version of Unicode, the following match zero code points:
15763
15764 =over 4
15765
15766 $zero_matches
15767
15768 =back
15769
15770 END
15771     }
15772
15773     # Generate list of properties that we don't accept, grouped by the reasons
15774     # why.  This is so only put out the 'why' once, and then list all the
15775     # properties that have that reason under it.
15776
15777     my %why_list;   # The keys are the reasons; the values are lists of
15778                     # properties that have the key as their reason
15779
15780     # For each property, add it to the list that are suppressed for its reason
15781     # The sort will cause the alphabetically first properties to be added to
15782     # each list first, so each list will be sorted.
15783     foreach my $property (sort keys %why_suppressed) {
15784         push @{$why_list{$why_suppressed{$property}}}, $property;
15785     }
15786
15787     # For each reason (sorted by the first property that has that reason)...
15788     my @bad_re_properties;
15789     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15790                      keys %why_list)
15791     {
15792         # Add to the output, all the properties that have that reason.
15793         my $has_item = 0;   # Flag if actually output anything.
15794         foreach my $name (@{$why_list{$why}}) {
15795
15796             # Split compound names into $property and $table components
15797             my $property = $name;
15798             my $table;
15799             if ($property =~ / (.*) = (.*) /x) {
15800                 $property = $1;
15801                 $table = $2;
15802             }
15803
15804             # This release of Unicode may not have a property that is
15805             # suppressed, so don't reference a non-existent one.
15806             $property = property_ref($property);
15807             next if ! defined $property;
15808
15809             # And since this list is only for match tables, don't list the
15810             # ones that don't have match tables.
15811             next if ! $property->to_create_match_tables;
15812
15813             # Find any abbreviation, and turn it into a compound name if this
15814             # is a property=value pair.
15815             my $short_name = $property->name;
15816             $short_name .= '=' . $property->table($table)->name if $table;
15817
15818             # Start with an empty line.
15819             push @bad_re_properties, "\n\n" unless $has_item;
15820
15821             # And add the property as an item for the reason.
15822             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15823             $has_item = 1;
15824         }
15825
15826         # And add the reason under the list of properties, if such a list
15827         # actually got generated.  Note that the header got added
15828         # unconditionally before.  But pod ignores extra blank lines, so no
15829         # harm.
15830         push @bad_re_properties, "\n$why\n" if $has_item;
15831
15832     } # End of looping through each reason.
15833
15834     if (! @bad_re_properties) {
15835         push @bad_re_properties,
15836                 "*** This installation accepts ALL non-Unihan properties ***";
15837     }
15838     else {
15839         # Add =over only if non-empty to avoid an empty =over/=back section,
15840         # which is considered bad form.
15841         unshift @bad_re_properties, "\n=over 4\n";
15842         push @bad_re_properties, "\n=back\n";
15843     }
15844
15845     # Similiarly, generate a list of files that we don't use, grouped by the
15846     # reasons why.  First, create a hash whose keys are the reasons, and whose
15847     # values are anonymous arrays of all the files that share that reason.
15848     my %grouped_by_reason;
15849     foreach my $file (keys %ignored_files) {
15850         push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15851     }
15852     foreach my $file (keys %skipped_files) {
15853         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15854     }
15855
15856     # Then, sort each group.
15857     foreach my $group (keys %grouped_by_reason) {
15858         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15859                                         @{$grouped_by_reason{$group}} ;
15860     }
15861
15862     # Finally, create the output text.  For each reason (sorted by the
15863     # alphabetically first file that has that reason)...
15864     my @unused_files;
15865     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15866                                cmp lc $grouped_by_reason{$b}->[0]
15867                               }
15868                          keys %grouped_by_reason)
15869     {
15870         # Add all the files that have that reason to the output.  Start
15871         # with an empty line.
15872         push @unused_files, "\n\n";
15873         push @unused_files, map { "\n=item F<$_> \n" }
15874                             @{$grouped_by_reason{$reason}};
15875         # And add the reason under the list of files
15876         push @unused_files, "\n$reason\n";
15877     }
15878
15879     # Similarly, create the output text for the UCD section of the pod
15880     my @ucd_pod;
15881     foreach my $key (keys %ucd_pod) {
15882         next unless $ucd_pod{$key}->{'output_this'};
15883         push @ucd_pod, format_pod_line($indent_info_column,
15884                                        $ucd_pod{$key}->{'name'},
15885                                        $ucd_pod{$key}->{'info'},
15886                                        $ucd_pod{$key}->{'status'},
15887                                       );
15888     }
15889
15890     # Sort alphabetically, and fold for output
15891     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15892     my $ucd_pod = simple_fold(\@ucd_pod,
15893                            ' ',
15894                            $indent_info_column,
15895                            $automatic_pod_indent);
15896     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
15897                 . "\n"
15898                 . $ucd_pod;
15899     local $" = "";
15900
15901     # Everything is ready to assemble.
15902     my @OUT = << "END";
15903 =begin comment
15904
15905 $HEADER
15906
15907 To change this file, edit $0 instead.
15908
15909 =end comment
15910
15911 =head1 NAME
15912
15913 $pod_file - Index of Unicode Version $string_version character properties in Perl
15914
15915 =head1 DESCRIPTION
15916
15917 This document provides information about the portion of the Unicode database
15918 that deals with character properties, that is the portion that is defined on
15919 single code points.  (L</Other information in the Unicode data base>
15920 below briefly mentions other data that Unicode provides.)
15921
15922 Perl can provide access to all non-provisional Unicode character properties,
15923 though not all are enabled by default.  The omitted ones are the Unihan
15924 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15925 deprecated or Unicode-internal properties.  (An installation may choose to
15926 recompile Perl's tables to change this.  See L<Unicode character
15927 properties that are NOT accepted by Perl>.)
15928
15929 For most purposes, access to Unicode properties from the Perl core is through
15930 regular expression matches, as described in the next section.
15931 For some special purposes, and to access the properties that are not suitable
15932 for regular expression matching, all the Unicode character properties that
15933 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15934 described in the section L</Properties accessible through Unicode::UCD>.
15935
15936 Perl also provides some additional extensions and short-cut synonyms
15937 for Unicode properties.
15938
15939 This document merely lists all available properties and does not attempt to
15940 explain what each property really means.  There is a brief description of each
15941 Perl extension; see L<perlunicode/Other Properties> for more information on
15942 these.  There is some detail about Blocks, Scripts, General_Category,
15943 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15944 official Unicode properties, refer to the Unicode standard.  A good starting
15945 place is L<$unicode_reference_url>.
15946
15947 Note that you can define your own properties; see
15948 L<perlunicode/"User-Defined Character Properties">.
15949
15950 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15951
15952 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15953 most of the Unicode character properties.  The table below shows all these
15954 constructs, both single and compound forms.
15955
15956 B<Compound forms> consist of two components, separated by an equals sign or a
15957 colon.  The first component is the property name, and the second component is
15958 the particular value of the property to match against, for example,
15959 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15960 whose Script property value is Greek.
15961
15962 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15963 their equivalent compound forms.  The table shows these equivalences.  (In our
15964 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
15965 There are also a few Perl-defined single forms that are not shortcuts for a
15966 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
15967
15968 In parsing these constructs, Perl always ignores Upper/lower case differences
15969 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
15970 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
15971 the left brace completely changes the meaning of the construct, from "match"
15972 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
15973 for improved legibility.
15974
15975 Also, white space, hyphens, and underscores are normally ignored
15976 everywhere between the {braces}, and hence can be freely added or removed
15977 even if the C</x> modifier hasn't been specified on the regular expression.
15978 But in the table below $a_bold_stricter at the beginning of an entry
15979 means that tighter (stricter) rules are used for that entry:
15980
15981 =over 4
15982
15983 =over 4
15984
15985 =item Single form (C<\\p{name}>) tighter rules:
15986
15987 White space, hyphens, and underscores ARE significant
15988 except for:
15989
15990 =over 4
15991
15992 =item * white space adjacent to a non-word character
15993
15994 =item * underscores separating digits in numbers
15995
15996 =back
15997
15998 That means, for example, that you can freely add or remove white space
15999 adjacent to (but within) the braces without affecting the meaning.
16000
16001 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
16002
16003 The tighter rules given above for the single form apply to everything to the
16004 right of the colon or equals; the looser rules still apply to everything to
16005 the left.
16006
16007 That means, for example, that you can freely add or remove white space
16008 adjacent to (but within) the braces and the colon or equal sign.
16009
16010 =back
16011
16012 =back
16013
16014 Some properties are considered obsolete by Unicode, but still available.
16015 There are several varieties of obsolescence:
16016
16017 =over 4
16018
16019 =over 4
16020
16021 =item Stabilized
16022
16023 A property may be stabilized.  Such a determination does not indicate
16024 that the property should or should not be used; instead it is a declaration
16025 that the property will not be maintained nor extended for newly encoded
16026 characters.  Such properties are marked with $a_bold_stabilized in the
16027 table.
16028
16029 =item Deprecated
16030
16031 A property may be deprecated, perhaps because its original intent
16032 has been replaced by another property, or because its specification was
16033 somehow defective.  This means that its use is strongly
16034 discouraged, so much so that a warning will be issued if used, unless the
16035 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
16036 statement.  $A_bold_deprecated flags each such entry in the table, and
16037 the entry there for the longest, most descriptive version of the property will
16038 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
16039 warning, even for properties that aren't officially deprecated by Unicode,
16040 when there used to be characters or code points that were matched by them, but
16041 no longer.  This is to warn you that your program may not work like it did on
16042 earlier Unicode releases.
16043
16044 A deprecated property may be made unavailable in a future Perl version, so it
16045 is best to move away from them.
16046
16047 A deprecated property may also be stabilized, but this fact is not shown.
16048
16049 =item Obsolete
16050
16051 Properties marked with $a_bold_obsolete in the table are considered (plain)
16052 obsolete.  Generally this designation is given to properties that Unicode once
16053 used for internal purposes (but not any longer).
16054
16055 =back
16056
16057 Some Perl extensions are present for backwards compatibility and are
16058 discouraged from being used, but are not obsolete.  $A_bold_discouraged
16059 flags each such entry in the table.  Future Unicode versions may force
16060 some of these extensions to be removed without warning, replaced by another
16061 property with the same name that means something different.  Use the
16062 equivalent shown instead.
16063
16064 =back
16065
16066 @block_warning
16067
16068 The table below has two columns.  The left column contains the C<\\p{}>
16069 constructs to look up, possibly preceded by the flags mentioned above; and
16070 the right column contains information about them, like a description, or
16071 synonyms.  The table shows both the single and compound forms for each
16072 property that has them.  If the left column is a short name for a property,
16073 the right column will give its longer, more descriptive name; and if the left
16074 column is the longest name, the right column will show any equivalent shortest
16075 name, in both single and compound forms if applicable.
16076
16077 The right column will also caution you if a property means something different
16078 than what might normally be expected.
16079
16080 All single forms are Perl extensions; a few compound forms are as well, and
16081 are noted as such.
16082
16083 Numbers in (parentheses) indicate the total number of Unicode code points
16084 matched by the property.  For emphasis, those properties that match no code
16085 points at all are listed as well in a separate section following the table.
16086
16087 Most properties match the same code points regardless of whether C<"/i">
16088 case-insensitive matching is specified or not.  But a few properties are
16089 affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
16090 in the second column.  Under case-insensitive matching they match the
16091 same code pode points as the property I<other_property>.
16092
16093 There is no description given for most non-Perl defined properties (See
16094 L<$unicode_reference_url> for that).
16095
16096 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
16097 combinations.  For example, entries like:
16098
16099  \\p{Gc: *}                                  \\p{General_Category: *}
16100
16101 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
16102 for the latter is also valid for the former.  Similarly,
16103
16104  \\p{Is_*}                                   \\p{*}
16105
16106 means that if and only if, for example, C<\\p{Foo}> exists, then
16107 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
16108 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
16109 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
16110 underscore.
16111
16112 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
16113 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
16114 'N*' to indicate this, and doesn't have separate entries for the other
16115 possibilities.  Note that not all properties which have values 'Yes' and 'No'
16116 are binary, and they have all their values spelled out without using this wild
16117 card, and a C<NOT> clause in their description that highlights their not being
16118 binary.  These also require the compound form to match them, whereas true
16119 binary properties have both single and compound forms available.
16120
16121 Note that all non-essential underscores are removed in the display of the
16122 short names below.
16123
16124 B<Legend summary:>
16125
16126 =over 4
16127
16128 =item *
16129
16130 B<*> is a wild-card
16131
16132 =item *
16133
16134 B<(\\d+)> in the info column gives the number of Unicode code points matched
16135 by this property.
16136
16137 =item *
16138
16139 B<$DEPRECATED> means this is deprecated.
16140
16141 =item *
16142
16143 B<$OBSOLETE> means this is obsolete.
16144
16145 =item *
16146
16147 B<$STABILIZED> means this is stabilized.
16148
16149 =item *
16150
16151 B<$STRICTER> means tighter (stricter) name matching applies.
16152
16153 =item *
16154
16155 B<$DISCOURAGED> means use of this form is discouraged, and may not be
16156 stable.
16157
16158 =back
16159
16160 $formatted_properties
16161
16162 $zero_matches
16163
16164 =head1 Properties accessible through Unicode::UCD
16165
16166 All the Unicode character properties mentioned above (except for those marked
16167 as for internal use by Perl) are also accessible by
16168 L<Unicode::UCD/prop_invlist()>.
16169
16170 Due to their nature, not all Unicode character properties are suitable for
16171 regular expression matches, nor C<prop_invlist()>.  The remaining
16172 non-provisional, non-internal ones are accessible via
16173 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
16174 hasn't included; see L<below for which those are|/Unicode character properties
16175 that are NOT accepted by Perl>).
16176
16177 For compatibility with other parts of Perl, all the single forms given in the
16178 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
16179 are recognized.  BUT, there are some ambiguities between some Perl extensions
16180 and the Unicode properties, all of which are silently resolved in favor of the
16181 official Unicode property.  To avoid surprises, you should only use
16182 C<prop_invmap()> for forms listed in the table below, which omits the
16183 non-recommended ones.  The affected forms are the Perl single form equivalents
16184 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
16185 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
16186 whose short name is C<sc>.  The table indicates the current ambiguities in the
16187 INFO column, beginning with the word C<"NOT">.
16188
16189 The standard Unicode properties listed below are documented in
16190 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
16191 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
16192 L<perlunicode/Other Properties>;
16193
16194 The first column in the table is a name for the property; the second column is
16195 an alternative name, if any, plus possibly some annotations.  The alternative
16196 name is the property's full name, unless that would simply repeat the first
16197 column, in which case the second column indicates the property's short name
16198 (if different).  The annotations are given only in the entry for the full
16199 name.  If a property is obsolete, etc, the entry will be flagged with the same
16200 characters used in the table in the L<section above|/Properties accessible
16201 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
16202
16203 $ucd_pod
16204
16205 =head1 Properties accessible through other means
16206
16207 Certain properties are accessible also via core function calls.  These are:
16208
16209  Lowercase_Mapping          lc() and lcfirst()
16210  Titlecase_Mapping          ucfirst()
16211  Uppercase_Mapping          uc()
16212
16213 Also, Case_Folding is accessible through the C</i> modifier in regular
16214 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
16215 operator.
16216
16217 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
16218 interpolation in double-quoted strings and regular expressions; and functions
16219 C<charnames::viacode()>, C<charnames::vianame()>, and
16220 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
16221 specified.
16222
16223 Finally, most properties related to decomposition are accessible via
16224 L<Unicode::Normalize>.
16225
16226 =head1 Unicode character properties that are NOT accepted by Perl
16227
16228 Perl will generate an error for a few character properties in Unicode when
16229 used in a regular expression.  The non-Unihan ones are listed below, with the
16230 reasons they are not accepted, perhaps with work-arounds.  The short names for
16231 the properties are listed enclosed in (parentheses).
16232 As described after the list, an installation can change the defaults and choose
16233 to accept any of these.  The list is machine generated based on the
16234 choices made for the installation that generated this document.
16235
16236 @bad_re_properties
16237
16238 An installation can choose to allow any of these to be matched by downloading
16239 the Unicode database from L<http://www.unicode.org/Public/> to
16240 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
16241 controlling lists contained in the program
16242 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
16243 (C<\%Config> is available from the Config module).
16244
16245 =head1 Other information in the Unicode data base
16246
16247 The Unicode data base is delivered in two different formats.  The XML version
16248 is valid for more modern Unicode releases.  The other version is a collection
16249 of files.  The two are intended to give equivalent information.  Perl uses the
16250 older form; this allows you to recompile Perl to use early Unicode releases.
16251
16252 The only non-character property that Perl currently supports is Named
16253 Sequences, in which a sequence of code points
16254 is given a name and generally treated as a single entity.  (Perl supports
16255 these via the C<\\N{...}> double-quotish construct,
16256 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
16257
16258 Below is a list of the files in the Unicode data base that Perl doesn't
16259 currently use, along with very brief descriptions of their purposes.
16260 Some of the names of the files have been shortened from those that Unicode
16261 uses, in order to allow them to be distinguishable from similarly named files
16262 on file systems for which only the first 8 characters of a name are
16263 significant.
16264
16265 =over 4
16266
16267 @unused_files
16268
16269 =back
16270
16271 =head1 SEE ALSO
16272
16273 L<$unicode_reference_url>
16274
16275 L<perlrecharclass>
16276
16277 L<perlunicode>
16278
16279 END
16280
16281     # And write it.  The 0 means no utf8.
16282     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
16283     return;
16284 }
16285
16286 sub make_Heavy () {
16287     # Create and write Heavy.pl, which passes info about the tables to
16288     # utf8_heavy.pl
16289
16290     # Stringify structures for output
16291     my $loose_property_name_of
16292                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
16293     chomp $loose_property_name_of;
16294
16295     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
16296     chomp $stricter_to_file_of;
16297
16298     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
16299     chomp $loose_to_file_of;
16300
16301     my $nv_floating_to_rational
16302                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
16303     chomp $nv_floating_to_rational;
16304
16305     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
16306     chomp $why_deprecated;
16307
16308     # We set the key to the file when we associated files with tables, but we
16309     # couldn't do the same for the value then, as we might not have the file
16310     # for the alternate table figured out at that time.
16311     foreach my $cased (keys %caseless_equivalent_to) {
16312         my @path = $caseless_equivalent_to{$cased}->file_path;
16313         my $path = join '/', @path[1, -1];
16314         $caseless_equivalent_to{$cased} = $path;
16315     }
16316     my $caseless_equivalent_to
16317                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
16318     chomp $caseless_equivalent_to;
16319
16320     my $loose_property_to_file_of
16321                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
16322     chomp $loose_property_to_file_of;
16323
16324     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
16325     chomp $file_to_swash_name;
16326
16327     my @heavy = <<END;
16328 $HEADER
16329 $INTERNAL_ONLY_HEADER
16330
16331 # This file is for the use of utf8_heavy.pl and Unicode::UCD
16332
16333 # Maps Unicode (not Perl single-form extensions) property names in loose
16334 # standard form to their corresponding standard names
16335 \%utf8::loose_property_name_of = (
16336 $loose_property_name_of
16337 );
16338
16339 # Maps property, table to file for those using stricter matching
16340 \%utf8::stricter_to_file_of = (
16341 $stricter_to_file_of
16342 );
16343
16344 # Maps property, table to file for those using loose matching
16345 \%utf8::loose_to_file_of = (
16346 $loose_to_file_of
16347 );
16348
16349 # Maps floating point to fractional form
16350 \%utf8::nv_floating_to_rational = (
16351 $nv_floating_to_rational
16352 );
16353
16354 # If a floating point number doesn't have enough digits in it to get this
16355 # close to a fraction, it isn't considered to be that fraction even if all the
16356 # digits it does have match.
16357 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
16358
16359 # Deprecated tables to generate a warning for.  The key is the file containing
16360 # the table, so as to avoid duplication, as many property names can map to the
16361 # file, but we only need one entry for all of them.
16362 \%utf8::why_deprecated = (
16363 $why_deprecated
16364 );
16365
16366 # A few properties have different behavior under /i matching.  This maps
16367 # those to substitute files to use under /i.
16368 \%utf8::caseless_equivalent = (
16369 $caseless_equivalent_to
16370 );
16371
16372 # Property names to mapping files
16373 \%utf8::loose_property_to_file_of = (
16374 $loose_property_to_file_of
16375 );
16376
16377 # Files to the swash names within them.
16378 \%utf8::file_to_swash_name = (
16379 $file_to_swash_name
16380 );
16381
16382 1;
16383 END
16384
16385     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
16386     return;
16387 }
16388
16389 sub make_Name_pm () {
16390     # Create and write Name.pm, which contains subroutines and data to use in
16391     # conjunction with Name.pl
16392
16393     # Maybe there's nothing to do.
16394     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
16395
16396     my @name = <<END;
16397 $HEADER
16398 $INTERNAL_ONLY_HEADER
16399 END
16400
16401     # Convert these structures to output format.
16402     my $code_points_ending_in_code_point =
16403         main::simple_dumper(\@code_points_ending_in_code_point,
16404                             ' ' x 8);
16405     my $names = main::simple_dumper(\%names_ending_in_code_point,
16406                                     ' ' x 8);
16407     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
16408                                     ' ' x 8);
16409
16410     # Do the same with the Hangul names,
16411     my $jamo;
16412     my $jamo_l;
16413     my $jamo_v;
16414     my $jamo_t;
16415     my $jamo_re;
16416     if ($has_hangul_syllables) {
16417
16418         # Construct a regular expression of all the possible
16419         # combinations of the Hangul syllables.
16420         my @L_re;   # Leading consonants
16421         for my $i ($LBase .. $LBase + $LCount - 1) {
16422             push @L_re, $Jamo{$i}
16423         }
16424         my @V_re;   # Middle vowels
16425         for my $i ($VBase .. $VBase + $VCount - 1) {
16426             push @V_re, $Jamo{$i}
16427         }
16428         my @T_re;   # Trailing consonants
16429         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
16430             push @T_re, $Jamo{$i}
16431         }
16432
16433         # The whole re is made up of the L V T combination.
16434         $jamo_re = '('
16435                     . join ('|', sort @L_re)
16436                     . ')('
16437                     . join ('|', sort @V_re)
16438                     . ')('
16439                     . join ('|', sort @T_re)
16440                     . ')?';
16441
16442         # These hashes needed by the algorithm were generated
16443         # during reading of the Jamo.txt file
16444         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
16445         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
16446         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
16447         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
16448     }
16449
16450     push @name, <<END;
16451
16452 package charnames;
16453
16454 # This module contains machine-generated tables and code for the
16455 # algorithmically-determinable Unicode character names.  The following
16456 # routines can be used to translate between name and code point and vice versa
16457
16458 { # Closure
16459
16460     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
16461     # two must be 10; if there are 5, the first must not be a 0.  Written this
16462     # way to decrease backtracking.  The first regex allows the code point to
16463     # be at the end of a word, but to work properly, the word shouldn't end
16464     # with a valid hex character.  The second one won't match a code point at
16465     # the end of a word, and doesn't have the run-on issue
16466     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
16467     my \$code_point_re = qr/$code_point_re/;
16468
16469     # In the following hash, the keys are the bases of names which include
16470     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
16471     # of each key is another hash which is used to get the low and high ends
16472     # for each range of code points that apply to the name.
16473     my %names_ending_in_code_point = (
16474 $names
16475     );
16476
16477     # The following hash is a copy of the previous one, except is for loose
16478     # matching, so each name has blanks and dashes squeezed out
16479     my %loose_names_ending_in_code_point = (
16480 $loose_names
16481     );
16482
16483     # And the following array gives the inverse mapping from code points to
16484     # names.  Lowest code points are first
16485     my \@code_points_ending_in_code_point = (
16486 $code_points_ending_in_code_point
16487     );
16488 END
16489     # Earlier releases didn't have Jamos.  No sense outputting
16490     # them unless will be used.
16491     if ($has_hangul_syllables) {
16492         push @name, <<END;
16493
16494     # Convert from code point to Jamo short name for use in composing Hangul
16495     # syllable names
16496     my %Jamo = (
16497 $jamo
16498     );
16499
16500     # Leading consonant (can be null)
16501     my %Jamo_L = (
16502 $jamo_l
16503     );
16504
16505     # Vowel
16506     my %Jamo_V = (
16507 $jamo_v
16508     );
16509
16510     # Optional trailing consonant
16511     my %Jamo_T = (
16512 $jamo_t
16513     );
16514
16515     # Computed re that splits up a Hangul name into LVT or LV syllables
16516     my \$syllable_re = qr/$jamo_re/;
16517
16518     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
16519     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
16520
16521     # These constants names and values were taken from the Unicode standard,
16522     # version 5.1, section 3.12.  They are used in conjunction with Hangul
16523     # syllables
16524     my \$SBase = $SBase_string;
16525     my \$LBase = $LBase_string;
16526     my \$VBase = $VBase_string;
16527     my \$TBase = $TBase_string;
16528     my \$SCount = $SCount;
16529     my \$LCount = $LCount;
16530     my \$VCount = $VCount;
16531     my \$TCount = $TCount;
16532     my \$NCount = \$VCount * \$TCount;
16533 END
16534     } # End of has Jamos
16535
16536     push @name, << 'END';
16537
16538     sub name_to_code_point_special {
16539         my ($name, $loose) = @_;
16540
16541         # Returns undef if not one of the specially handled names; otherwise
16542         # returns the code point equivalent to the input name
16543         # $loose is non-zero if to use loose matching, 'name' in that case
16544         # must be input as upper case with all blanks and dashes squeezed out.
16545 END
16546     if ($has_hangul_syllables) {
16547         push @name, << 'END';
16548
16549         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
16550             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
16551         {
16552             return if $name !~ qr/^$syllable_re$/;
16553             my $L = $Jamo_L{$1};
16554             my $V = $Jamo_V{$2};
16555             my $T = (defined $3) ? $Jamo_T{$3} : 0;
16556             return ($L * $VCount + $V) * $TCount + $T + $SBase;
16557         }
16558 END
16559     }
16560     push @name, << 'END';
16561
16562         # Name must end in 'code_point' for this to handle.
16563         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
16564                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
16565
16566         my $base = $1;
16567         my $code_point = CORE::hex $2;
16568         my $names_ref;
16569
16570         if ($loose) {
16571             $names_ref = \%loose_names_ending_in_code_point;
16572         }
16573         else {
16574             return if $base !~ s/-$//;
16575             $names_ref = \%names_ending_in_code_point;
16576         }
16577
16578         # Name must be one of the ones which has the code point in it.
16579         return if ! $names_ref->{$base};
16580
16581         # Look through the list of ranges that apply to this name to see if
16582         # the code point is in one of them.
16583         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
16584             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
16585             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
16586
16587             # Here, the code point is in the range.
16588             return $code_point;
16589         }
16590
16591         # Here, looked like the name had a code point number in it, but
16592         # did not match one of the valid ones.
16593         return;
16594     }
16595
16596     sub code_point_to_name_special {
16597         my $code_point = shift;
16598
16599         # Returns the name of a code point if algorithmically determinable;
16600         # undef if not
16601 END
16602     if ($has_hangul_syllables) {
16603         push @name, << 'END';
16604
16605         # If in the Hangul range, calculate the name based on Unicode's
16606         # algorithm
16607         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
16608             use integer;
16609             my $SIndex = $code_point - $SBase;
16610             my $L = $LBase + $SIndex / $NCount;
16611             my $V = $VBase + ($SIndex % $NCount) / $TCount;
16612             my $T = $TBase + $SIndex % $TCount;
16613             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
16614             $name .= $Jamo{$T} if $T != $TBase;
16615             return $name;
16616         }
16617 END
16618     }
16619     push @name, << 'END';
16620
16621         # Look through list of these code points for one in range.
16622         foreach my $hash (@code_points_ending_in_code_point) {
16623             return if $code_point < $hash->{'low'};
16624             if ($code_point <= $hash->{'high'}) {
16625                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
16626             }
16627         }
16628         return;            # None found
16629     }
16630 } # End closure
16631
16632 1;
16633 END
16634
16635     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
16636     return;
16637 }
16638
16639 sub make_UCD () {
16640     # Create and write UCD.pl, which passes info about the tables to
16641     # Unicode::UCD
16642
16643     # Create a mapping from each alias of Perl single-form extensions to all
16644     # its equivalent aliases, for quick look-up.
16645     my %perlprop_to_aliases;
16646     foreach my $table ($perl->tables) {
16647
16648         # First create the list of the aliases of each extension
16649         my @aliases_list;    # List of legal aliases for this extension
16650
16651         my $table_name = $table->name;
16652         my $standard_table_name = standardize($table_name);
16653         my $table_full_name = $table->full_name;
16654         my $standard_table_full_name = standardize($table_full_name);
16655
16656         # Make sure that the list has both the short and full names
16657         push @aliases_list, $table_name, $table_full_name;
16658
16659         my $found_ucd = 0;  # ? Did we actually get an alias that should be
16660                             # output for this table
16661
16662         # Go through all the aliases (including the two just added), and add
16663         # any new unique ones to the list
16664         foreach my $alias ($table->aliases) {
16665
16666             # Skip non-legal names
16667             next unless $alias->ok_as_filename;
16668             next unless $alias->ucd;
16669
16670             $found_ucd = 1;     # have at least one legal name
16671
16672             my $name = $alias->name;
16673             my $standard = standardize($name);
16674
16675             # Don't repeat a name that is equivalent to one already on the
16676             # list
16677             next if $standard eq $standard_table_name;
16678             next if $standard eq $standard_table_full_name;
16679
16680             push @aliases_list, $name;
16681         }
16682
16683         # If there were no legal names, don't output anything.
16684         next unless $found_ucd;
16685
16686         # To conserve memory in the program reading these in, omit full names
16687         # that are identical to the short name, when those are the only two
16688         # aliases for the property.
16689         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16690             pop @aliases_list;
16691         }
16692
16693         # Here, @aliases_list is the list of all the aliases that this
16694         # extension legally has.  Now can create a map to it from each legal
16695         # standardized alias
16696         foreach my $alias ($table->aliases) {
16697             next unless $alias->ucd;
16698             next unless $alias->ok_as_filename;
16699             push @{$perlprop_to_aliases{standardize($alias->name)}},
16700                  @aliases_list;
16701         }
16702     }
16703
16704     # Make a list of all combinations of properties/values that are suppressed.
16705     my @suppressed;
16706     if (! $debug_skip) {    # This tends to fail in this debug mode
16707         foreach my $property_name (keys %why_suppressed) {
16708
16709             # Just the value
16710             my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16711
16712             # The hash may contain properties not in this release of Unicode
16713             next unless defined (my $property = property_ref($property_name));
16714
16715             # Find all combinations
16716             foreach my $prop_alias ($property->aliases) {
16717                 my $prop_alias_name = standardize($prop_alias->name);
16718
16719                 # If no =value, there's just one combination possibe for this
16720                 if (! $value_name) {
16721
16722                     # The property may be suppressed, but there may be a proxy
16723                     # for it, so it shouldn't be listed as suppressed
16724                     next if $prop_alias->ucd;
16725                     push @suppressed, $prop_alias_name;
16726                 }
16727                 else {  # Otherwise
16728                     foreach my $value_alias
16729                                     ($property->table($value_name)->aliases)
16730                     {
16731                         next if $value_alias->ucd;
16732
16733                         push @suppressed, "$prop_alias_name="
16734                                         .  standardize($value_alias->name);
16735                     }
16736                 }
16737             }
16738         }
16739     }
16740     @suppressed = sort @suppressed; # So doesn't change between runs of this
16741                                     # program
16742
16743     # Convert the structure below (designed for Name.pm) to a form that UCD
16744     # wants, so it doesn't have to modify it at all; i.e. so that it includes
16745     # an element for the Hangul syllables in the appropriate place, and
16746     # otherwise changes the name to include the "-<code point>" suffix.
16747     my @algorithm_names;
16748     my $done_hangul = 0;
16749
16750     # Copy it linearly.
16751     for my $i (0 .. @code_points_ending_in_code_point - 1) {
16752
16753         # Insert the hanguls in the correct place.
16754         if (! $done_hangul
16755             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16756         {
16757             $done_hangul = 1;
16758             push @algorithm_names, { low => $SBase,
16759                                      high => $SBase + $SCount - 1,
16760                                      name => '<hangul syllable>',
16761                                     };
16762         }
16763
16764         # Copy the current entry, modified.
16765         push @algorithm_names, {
16766             low => $code_points_ending_in_code_point[$i]->{'low'},
16767             high => $code_points_ending_in_code_point[$i]->{'high'},
16768             name =>
16769                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16770         };
16771     }
16772
16773     # Serialize these structures for output.
16774     my $loose_to_standard_value
16775                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16776     chomp $loose_to_standard_value;
16777
16778     my $string_property_loose_to_name
16779                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16780     chomp $string_property_loose_to_name;
16781
16782     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16783     chomp $perlprop_to_aliases;
16784
16785     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16786     chomp $prop_aliases;
16787
16788     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16789     chomp $prop_value_aliases;
16790
16791     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16792     chomp $suppressed;
16793
16794     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16795     chomp $algorithm_names;
16796
16797     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16798     chomp $ambiguous_names;
16799
16800     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16801     chomp $loose_defaults;
16802
16803     my @ucd = <<END;
16804 $HEADER
16805 $INTERNAL_ONLY_HEADER
16806
16807 # This file is for the use of Unicode::UCD
16808
16809 # Highest legal Unicode code point
16810 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16811
16812 # Hangul syllables
16813 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16814 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16815
16816 # Keys are all the possible "prop=value" combinations, in loose form; values
16817 # are the standard loose name for the 'value' part of the key
16818 \%Unicode::UCD::loose_to_standard_value = (
16819 $loose_to_standard_value
16820 );
16821
16822 # String property loose names to standard loose name
16823 \%Unicode::UCD::string_property_loose_to_name = (
16824 $string_property_loose_to_name
16825 );
16826
16827 # Keys are Perl extensions in loose form; values are each one's list of
16828 # aliases
16829 \%Unicode::UCD::loose_perlprop_to_name = (
16830 $perlprop_to_aliases
16831 );
16832
16833 # Keys are standard property name; values are each one's aliases
16834 \%Unicode::UCD::prop_aliases = (
16835 $prop_aliases
16836 );
16837
16838 # Keys of top level are standard property name; values are keys to another
16839 # hash,  Each one is one of the property's values, in standard form.  The
16840 # values are that prop-val's aliases.  If only one specified, the short and
16841 # long alias are identical.
16842 \%Unicode::UCD::prop_value_aliases = (
16843 $prop_value_aliases
16844 );
16845
16846 # Ordered (by code point ordinal) list of the ranges of code points whose
16847 # names are algorithmically determined.  Each range entry is an anonymous hash
16848 # of the start and end points and a template for the names within it.
16849 \@Unicode::UCD::algorithmic_named_code_points = (
16850 $algorithm_names
16851 );
16852
16853 # The properties that as-is have two meanings, and which must be disambiguated
16854 \%Unicode::UCD::ambiguous_names = (
16855 $ambiguous_names
16856 );
16857
16858 # Keys are the prop-val combinations which are the default values for the
16859 # given property, expressed in standard loose form
16860 \%Unicode::UCD::loose_defaults = (
16861 $loose_defaults
16862 );
16863
16864 # All combinations of names that are suppressed.
16865 # This is actually for UCD.t, so it knows which properties shouldn't have
16866 # entries.  If it got any bigger, would probably want to put it in its own
16867 # file to use memory only when it was needed, in testing.
16868 \@Unicode::UCD::suppressed_properties = (
16869 $suppressed
16870 );
16871
16872 1;
16873 END
16874
16875     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
16876     return;
16877 }
16878
16879 sub write_all_tables() {
16880     # Write out all the tables generated by this program to files, as well as
16881     # the supporting data structures, pod file, and .t file.
16882
16883     my @writables;              # List of tables that actually get written
16884     my %match_tables_to_write;  # Used to collapse identical match tables
16885                                 # into one file.  Each key is a hash function
16886                                 # result to partition tables into buckets.
16887                                 # Each value is an array of the tables that
16888                                 # fit in the bucket.
16889
16890     # For each property ...
16891     # (sort so that if there is an immutable file name, it has precedence, so
16892     # some other property can't come in and take over its file name.  (We
16893     # don't care if both defined, as they had better be different anyway.)
16894     # The property named 'Perl' needs to be first (it doesn't have any
16895     # immutable file name) because empty properties are defined in terms of
16896     # it's table named 'All'.)   We also sort by the property's name.  This is
16897     # just for repeatability of the outputs between runs of this program, but
16898     # does not affect correctness.
16899     PROPERTY:
16900     foreach my $property ($perl,
16901                           sort { return -1 if defined $a->file;
16902                                  return 1 if defined $b->file;
16903                                  return $a->name cmp $b->name;
16904                                 } grep { $_ != $perl } property_ref('*'))
16905     {
16906         my $type = $property->type;
16907
16908         # And for each table for that property, starting with the mapping
16909         # table for it ...
16910         TABLE:
16911         foreach my $table($property,
16912
16913                         # and all the match tables for it (if any), sorted so
16914                         # the ones with the shortest associated file name come
16915                         # first.  The length sorting prevents problems of a
16916                         # longer file taking a name that might have to be used
16917                         # by a shorter one.  The alphabetic sorting prevents
16918                         # differences between releases
16919                         sort {  my $ext_a = $a->external_name;
16920                                 return 1 if ! defined $ext_a;
16921                                 my $ext_b = $b->external_name;
16922                                 return -1 if ! defined $ext_b;
16923
16924                                 # But return the non-complement table before
16925                                 # the complement one, as the latter is defined
16926                                 # in terms of the former, and needs to have
16927                                 # the information for the former available.
16928                                 return 1 if $a->complement != 0;
16929                                 return -1 if $b->complement != 0;
16930
16931                                 # Similarly, return a subservient table after
16932                                 # a leader
16933                                 return 1 if $a->leader != $a;
16934                                 return -1 if $b->leader != $b;
16935
16936                                 my $cmp = length $ext_a <=> length $ext_b;
16937
16938                                 # Return result if lengths not equal
16939                                 return $cmp if $cmp;
16940
16941                                 # Alphabetic if lengths equal
16942                                 return $ext_a cmp $ext_b
16943                         } $property->tables
16944                     )
16945         {
16946
16947             # Here we have a table associated with a property.  It could be
16948             # the map table (done first for each property), or one of the
16949             # other tables.  Determine which type.
16950             my $is_property = $table->isa('Property');
16951
16952             my $name = $table->name;
16953             my $complete_name = $table->complete_name;
16954
16955             # See if should suppress the table if is empty, but warn if it
16956             # contains something.
16957             my $suppress_if_empty_warn_if_not
16958                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
16959
16960             # Calculate if this table should have any code points associated
16961             # with it or not.
16962             my $expected_empty =
16963
16964                 # $perl should be empty, as well as properties that we just
16965                 # don't do anything with
16966                 ($is_property
16967                     && ($table == $perl
16968                         || grep { $complete_name eq $_ }
16969                                                     @unimplemented_properties
16970                     )
16971                 )
16972
16973                 # Match tables in properties we skipped populating should be
16974                 # empty
16975                 || (! $is_property && ! $property->to_create_match_tables)
16976
16977                 # Tables and properties that are expected to have no code
16978                 # points should be empty
16979                 || $suppress_if_empty_warn_if_not
16980             ;
16981
16982             # Set a boolean if this table is the complement of an empty binary
16983             # table
16984             my $is_complement_of_empty_binary =
16985                 $type == $BINARY &&
16986                 (($table == $property->table('Y')
16987                     && $property->table('N')->is_empty)
16988                 || ($table == $property->table('N')
16989                     && $property->table('Y')->is_empty));
16990
16991             if ($table->is_empty) {
16992
16993                 if ($suppress_if_empty_warn_if_not) {
16994                     $table->set_fate($SUPPRESSED,
16995                                      $suppress_if_empty_warn_if_not);
16996                 }
16997
16998                 # Suppress (by skipping them) expected empty tables.
16999                 next TABLE if $expected_empty;
17000
17001                 # And setup to later output a warning for those that aren't
17002                 # known to be allowed to be empty.  Don't do the warning if
17003                 # this table is a child of another one to avoid duplicating
17004                 # the warning that should come from the parent one.
17005                 if (($table == $property || $table->parent == $table)
17006                     && $table->fate != $SUPPRESSED
17007                     && $table->fate != $MAP_PROXIED
17008                     && ! grep { $complete_name =~ /^$_$/ }
17009                                                     @tables_that_may_be_empty)
17010                 {
17011                     push @unhandled_properties, "$table";
17012                 }
17013
17014                 # An empty table is just the complement of everything.
17015                 $table->set_complement($All) if $table != $property;
17016             }
17017             elsif ($expected_empty) {
17018                 my $because = "";
17019                 if ($suppress_if_empty_warn_if_not) {
17020                     $because = " because $suppress_if_empty_warn_if_not";
17021                 }
17022
17023                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
17024             }
17025
17026             # Some tables should match everything
17027             my $expected_full =
17028                 ($table->fate == $SUPPRESSED)
17029                 ? 0
17030                 : ($is_property)
17031                   ? # All these types of map tables will be full because
17032                     # they will have been populated with defaults
17033                     ($type == $ENUM || $type == $FORCED_BINARY)
17034
17035                   : # A match table should match everything if its method
17036                     # shows it should
17037                     ($table->matches_all
17038
17039                     # The complement of an empty binary table will match
17040                     # everything
17041                     || $is_complement_of_empty_binary
17042                     )
17043             ;
17044
17045             my $count = $table->count;
17046             if ($expected_full) {
17047                 if ($count != $MAX_WORKING_CODEPOINTS) {
17048                     Carp::my_carp("$table matches only "
17049                     . clarify_number($count)
17050                     . " Unicode code points but should match "
17051                     . clarify_number($MAX_WORKING_CODEPOINTS)
17052                     . " (off by "
17053                     .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
17054                     . ").  Proceeding anyway.");
17055                 }
17056
17057                 # Here is expected to be full.  If it is because it is the
17058                 # complement of an (empty) binary table that is to be
17059                 # suppressed, then suppress this one as well.
17060                 if ($is_complement_of_empty_binary) {
17061                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
17062                     my $opposing = $property->table($opposing_name);
17063                     my $opposing_status = $opposing->status;
17064                     if ($opposing_status) {
17065                         $table->set_status($opposing_status,
17066                                            $opposing->status_info);
17067                     }
17068                 }
17069             }
17070             elsif ($count == $MAX_UNICODE_CODEPOINTS
17071                    && $name ne "Any"
17072                    && ($table == $property || $table->leader == $table)
17073                    && $table->property->status ne $NORMAL)
17074             {
17075                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
17076             }
17077
17078             if ($table->fate >= $SUPPRESSED) {
17079                 if (! $is_property) {
17080                     my @children = $table->children;
17081                     foreach my $child (@children) {
17082                         if ($child->fate < $SUPPRESSED) {
17083                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
17084                         }
17085                     }
17086                 }
17087                 next TABLE;
17088
17089             }
17090
17091             if (! $is_property) {
17092
17093                 make_ucd_table_pod_entries($table) if $table->property == $perl;
17094
17095                 # Several things need to be done just once for each related
17096                 # group of match tables.  Do them on the parent.
17097                 if ($table->parent == $table) {
17098
17099                     # Add an entry in the pod file for the table; it also does
17100                     # the children.
17101                     make_re_pod_entries($table) if defined $pod_directory;
17102
17103                     # See if the the table matches identical code points with
17104                     # something that has already been output.  In that case,
17105                     # no need to have two files with the same code points in
17106                     # them.  We use the table's hash() method to store these
17107                     # in buckets, so that it is quite likely that if two
17108                     # tables are in the same bucket they will be identical, so
17109                     # don't have to compare tables frequently.  The tables
17110                     # have to have the same status to share a file, so add
17111                     # this to the bucket hash.  (The reason for this latter is
17112                     # that Heavy.pl associates a status with a file.)
17113                     # We don't check tables that are inverses of others, as it
17114                     # would lead to some coding complications, and checking
17115                     # all the regular ones should find everything.
17116                     if ($table->complement == 0) {
17117                         my $hash = $table->hash . ';' . $table->status;
17118
17119                         # Look at each table that is in the same bucket as
17120                         # this one would be.
17121                         foreach my $comparison
17122                                             (@{$match_tables_to_write{$hash}})
17123                         {
17124                             if ($table->matches_identically_to($comparison)) {
17125                                 $table->set_equivalent_to($comparison,
17126                                                                 Related => 0);
17127                                 next TABLE;
17128                             }
17129                         }
17130
17131                         # Here, not equivalent, add this table to the bucket.
17132                         push @{$match_tables_to_write{$hash}}, $table;
17133                     }
17134                 }
17135             }
17136             else {
17137
17138                 # Here is the property itself.
17139                 # Don't write out or make references to the $perl property
17140                 next if $table == $perl;
17141
17142                 make_ucd_table_pod_entries($table);
17143
17144                 # There is a mapping stored of the various synonyms to the
17145                 # standardized name of the property for utf8_heavy.pl.
17146                 # Also, the pod file contains entries of the form:
17147                 # \p{alias: *}         \p{full: *}
17148                 # rather than show every possible combination of things.
17149
17150                 my @property_aliases = $property->aliases;
17151
17152                 my $full_property_name = $property->full_name;
17153                 my $property_name = $property->name;
17154                 my $standard_property_name = standardize($property_name);
17155                 my $standard_property_full_name
17156                                         = standardize($full_property_name);
17157
17158                 # We also create for Unicode::UCD a list of aliases for
17159                 # the property.  The list starts with the property name;
17160                 # then its full name.  Legacy properties are not listed in
17161                 # Unicode::UCD.
17162                 my @property_list;
17163                 my @standard_list;
17164                 if ( $property->fate <= $MAP_PROXIED) {
17165                     @property_list = ($property_name, $full_property_name);
17166                     @standard_list = ($standard_property_name,
17167                                         $standard_property_full_name);
17168                 }
17169
17170                 # For each synonym ...
17171                 for my $i (0 .. @property_aliases - 1)  {
17172                     my $alias = $property_aliases[$i];
17173                     my $alias_name = $alias->name;
17174                     my $alias_standard = standardize($alias_name);
17175
17176
17177                     # Add other aliases to the list of property aliases
17178                     if ($property->fate <= $MAP_PROXIED
17179                         && ! grep { $alias_standard eq $_ } @standard_list)
17180                     {
17181                         push @property_list, $alias_name;
17182                         push @standard_list, $alias_standard;
17183                     }
17184
17185                     # For utf8_heavy, set the mapping of the alias to the
17186                     # property
17187                     if ($type == $STRING) {
17188                         if ($property->fate <= $MAP_PROXIED) {
17189                             $string_property_loose_to_name{$alias_standard}
17190                                             = $standard_property_name;
17191                         }
17192                     }
17193                     else {
17194                         if (exists ($loose_property_name_of{$alias_standard}))
17195                         {
17196                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
17197                         }
17198                         else {
17199                             $loose_property_name_of{$alias_standard}
17200                                                 = $standard_property_name;
17201                         }
17202
17203                         # Now for the re pod entry for this alias.  Skip if not
17204                         # outputting a pod; skip the first one, which is the
17205                         # full name so won't have an entry like: '\p{full: *}
17206                         # \p{full: *}', and skip if don't want an entry for
17207                         # this one.
17208                         next if $i == 0
17209                                 || ! defined $pod_directory
17210                                 || ! $alias->make_re_pod_entry;
17211
17212                         my $rhs = "\\p{$full_property_name: *}";
17213                         if ($property != $perl && $table->perl_extension) {
17214                             $rhs .= ' (Perl extension)';
17215                         }
17216                         push @match_properties,
17217                             format_pod_line($indent_info_column,
17218                                         '\p{' . $alias->name . ': *}',
17219                                         $rhs,
17220                                         $alias->status);
17221                     }
17222                 }
17223
17224                 # The list of all possible names is attached to each alias, so
17225                 # lookup is easy
17226                 if (@property_list) {
17227                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
17228                 }
17229
17230                 if ($property->fate <= $MAP_PROXIED) {
17231
17232                     # Similarly, we create for Unicode::UCD a list of
17233                     # property-value aliases.
17234
17235                     my $property_full_name = $property->full_name;
17236
17237                     # Look at each table in the property...
17238                     foreach my $table ($property->tables) {
17239                         my @values_list;
17240                         my $table_full_name = $table->full_name;
17241                         my $standard_table_full_name
17242                                               = standardize($table_full_name);
17243                         my $table_name = $table->name;
17244                         my $standard_table_name = standardize($table_name);
17245
17246                         # The list starts with the table name and its full
17247                         # name.
17248                         push @values_list, $table_name, $table_full_name;
17249
17250                         # We add to the table each unique alias that isn't
17251                         # discouraged from use.
17252                         foreach my $alias ($table->aliases) {
17253                             next if $alias->status
17254                                  && $alias->status eq $DISCOURAGED;
17255                             my $name = $alias->name;
17256                             my $standard = standardize($name);
17257                             next if $standard eq $standard_table_name;
17258                             next if $standard eq $standard_table_full_name;
17259                             push @values_list, $name;
17260                         }
17261
17262                         # Here @values_list is a list of all the aliases for
17263                         # the table.  That is, all the property-values given
17264                         # by this table.  By agreement with Unicode::UCD,
17265                         # if the name and full name are identical, and there
17266                         # are no other names, drop the duplcate entry to save
17267                         # memory.
17268                         if (@values_list == 2
17269                             && $values_list[0] eq $values_list[1])
17270                         {
17271                             pop @values_list
17272                         }
17273
17274                         # To save memory, unlike the similar list for property
17275                         # aliases above, only the standard forms hve the list.
17276                         # This forces an extra step of converting from input
17277                         # name to standard name, but the savings are
17278                         # considerable.  (There is only marginal savings if we
17279                         # did this with the property aliases.)
17280                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
17281                     }
17282                 }
17283
17284                 # Don't write out a mapping file if not desired.
17285                 next if ! $property->to_output_map;
17286             }
17287
17288             # Here, we know we want to write out the table, but don't do it
17289             # yet because there may be other tables that come along and will
17290             # want to share the file, and the file's comments will change to
17291             # mention them.  So save for later.
17292             push @writables, $table;
17293
17294         } # End of looping through the property and all its tables.
17295     } # End of looping through all properties.
17296
17297     # Now have all the tables that will have files written for them.  Do it.
17298     foreach my $table (@writables) {
17299         my @directory;
17300         my $filename;
17301         my $property = $table->property;
17302         my $is_property = ($table == $property);
17303         if (! $is_property) {
17304
17305             # Match tables for the property go in lib/$subdirectory, which is
17306             # the property's name.  Don't use the standard file name for this,
17307             # as may get an unfamiliar alias
17308             @directory = ($matches_directory, $property->external_name);
17309         }
17310         else {
17311
17312             @directory = $table->directory;
17313             $filename = $table->file;
17314         }
17315
17316         # Use specified filename if available, or default to property's
17317         # shortest name.  We need an 8.3 safe filename (which means "an 8
17318         # safe" filename, since after the dot is only 'pl', which is < 3)
17319         # The 2nd parameter is if the filename shouldn't be changed, and
17320         # it shouldn't iff there is a hard-coded name for this table.
17321         $filename = construct_filename(
17322                                 $filename || $table->external_name,
17323                                 ! $filename,    # mutable if no filename
17324                                 \@directory);
17325
17326         register_file_for_name($table, \@directory, $filename);
17327
17328         # Only need to write one file when shared by more than one
17329         # property
17330         next if ! $is_property
17331                 && ($table->leader != $table || $table->complement != 0);
17332
17333         # Construct a nice comment to add to the file
17334         $table->set_final_comment;
17335
17336         $table->write;
17337     }
17338
17339
17340     # Write out the pod file
17341     make_pod;
17342
17343     # And Heavy.pl, Name.pm, UCD.pl
17344     make_Heavy;
17345     make_Name_pm;
17346     make_UCD;
17347
17348     make_property_test_script() if $make_test_script;
17349     make_normalization_test_script() if $make_norm_test_script;
17350     return;
17351 }
17352
17353 my @white_space_separators = ( # This used only for making the test script.
17354                             "",
17355                             ' ',
17356                             "\t",
17357                             '   '
17358                         );
17359
17360 sub generate_separator($) {
17361     # This used only for making the test script.  It generates the colon or
17362     # equal separator between the property and property value, with random
17363     # white space surrounding the separator
17364
17365     my $lhs = shift;
17366
17367     return "" if $lhs eq "";  # No separator if there's only one (the r) side
17368
17369     # Choose space before and after randomly
17370     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
17371     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
17372
17373     # And return the whole complex, half the time using a colon, half the
17374     # equals
17375     return $spaces_before
17376             . (rand() < 0.5) ? '=' : ':'
17377             . $spaces_after;
17378 }
17379
17380 sub generate_tests($$$$$) {
17381     # This used only for making the test script.  It generates test cases that
17382     # are expected to compile successfully in perl.  Note that the lhs and
17383     # rhs are assumed to already be as randomized as the caller wants.
17384
17385     my $lhs = shift;           # The property: what's to the left of the colon
17386                                #  or equals separator
17387     my $rhs = shift;           # The property value; what's to the right
17388     my $valid_code = shift;    # A code point that's known to be in the
17389                                # table given by lhs=rhs; undef if table is
17390                                # empty
17391     my $invalid_code = shift;  # A code point known to not be in the table;
17392                                # undef if the table is all code points
17393     my $warning = shift;
17394
17395     # Get the colon or equal
17396     my $separator = generate_separator($lhs);
17397
17398     # The whole 'property=value'
17399     my $name = "$lhs$separator$rhs";
17400
17401     my @output;
17402     # Create a complete set of tests, with complements.
17403     if (defined $valid_code) {
17404         push @output, <<"EOC"
17405 Expect(1, $valid_code, '\\p{$name}', $warning);
17406 Expect(0, $valid_code, '\\p{^$name}', $warning);
17407 Expect(0, $valid_code, '\\P{$name}', $warning);
17408 Expect(1, $valid_code, '\\P{^$name}', $warning);
17409 EOC
17410     }
17411     if (defined $invalid_code) {
17412         push @output, <<"EOC"
17413 Expect(0, $invalid_code, '\\p{$name}', $warning);
17414 Expect(1, $invalid_code, '\\p{^$name}', $warning);
17415 Expect(1, $invalid_code, '\\P{$name}', $warning);
17416 Expect(0, $invalid_code, '\\P{^$name}', $warning);
17417 EOC
17418     }
17419     return @output;
17420 }
17421
17422 sub generate_error($$$) {
17423     # This used only for making the test script.  It generates test cases that
17424     # are expected to not only not match, but to be syntax or similar errors
17425
17426     my $lhs = shift;                # The property: what's to the left of the
17427                                     # colon or equals separator
17428     my $rhs = shift;                # The property value; what's to the right
17429     my $already_in_error = shift;   # Boolean; if true it's known that the
17430                                 # unmodified lhs and rhs will cause an error.
17431                                 # This routine should not force another one
17432     # Get the colon or equal
17433     my $separator = generate_separator($lhs);
17434
17435     # Since this is an error only, don't bother to randomly decide whether to
17436     # put the error on the left or right side; and assume that the rhs is
17437     # loosely matched, again for convenience rather than rigor.
17438     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
17439
17440     my $property = $lhs . $separator . $rhs;
17441
17442     return <<"EOC";
17443 Error('\\p{$property}');
17444 Error('\\P{$property}');
17445 EOC
17446 }
17447
17448 # These are used only for making the test script
17449 # XXX Maybe should also have a bad strict seps, which includes underscore.
17450
17451 my @good_loose_seps = (
17452             " ",
17453             "-",
17454             "\t",
17455             "",
17456             "_",
17457            );
17458 my @bad_loose_seps = (
17459            "/a/",
17460            ':=',
17461           );
17462
17463 sub randomize_stricter_name {
17464     # This used only for making the test script.  Take the input name and
17465     # return a randomized, but valid version of it under the stricter matching
17466     # rules.
17467
17468     my $name = shift;
17469     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17470
17471     # If the name looks like a number (integer, floating, or rational), do
17472     # some extra work
17473     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
17474         my $sign = $1;
17475         my $number = $2;
17476         my $separator = $3;
17477
17478         # If there isn't a sign, part of the time add a plus
17479         # Note: Not testing having any denominator having a minus sign
17480         if (! $sign) {
17481             $sign = '+' if rand() <= .3;
17482         }
17483
17484         # And add 0 or more leading zeros.
17485         $name = $sign . ('0' x int rand(10)) . $number;
17486
17487         if (defined $separator) {
17488             my $extra_zeros = '0' x int rand(10);
17489
17490             if ($separator eq '.') {
17491
17492                 # Similarly, add 0 or more trailing zeros after a decimal
17493                 # point
17494                 $name .= $extra_zeros;
17495             }
17496             else {
17497
17498                 # Or, leading zeros before the denominator
17499                 $name =~ s,/,/$extra_zeros,;
17500             }
17501         }
17502     }
17503
17504     # For legibility of the test, only change the case of whole sections at a
17505     # time.  To do this, first split into sections.  The split returns the
17506     # delimiters
17507     my @sections;
17508     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
17509         trace $section if main::DEBUG && $to_trace;
17510
17511         if (length $section > 1 && $section !~ /\D/) {
17512
17513             # If the section is a sequence of digits, about half the time
17514             # randomly add underscores between some of them.
17515             if (rand() > .5) {
17516
17517                 # Figure out how many underscores to add.  max is 1 less than
17518                 # the number of digits.  (But add 1 at the end to make sure
17519                 # result isn't 0, and compensate earlier by subtracting 2
17520                 # instead of 1)
17521                 my $num_underscores = int rand(length($section) - 2) + 1;
17522
17523                 # And add them evenly throughout, for convenience, not rigor
17524                 use integer;
17525                 my $spacing = (length($section) - 1)/ $num_underscores;
17526                 my $temp = $section;
17527                 $section = "";
17528                 for my $i (1 .. $num_underscores) {
17529                     $section .= substr($temp, 0, $spacing, "") . '_';
17530                 }
17531                 $section .= $temp;
17532             }
17533             push @sections, $section;
17534         }
17535         else {
17536
17537             # Here not a sequence of digits.  Change the case of the section
17538             # randomly
17539             my $switch = int rand(4);
17540             if ($switch == 0) {
17541                 push @sections, uc $section;
17542             }
17543             elsif ($switch == 1) {
17544                 push @sections, lc $section;
17545             }
17546             elsif ($switch == 2) {
17547                 push @sections, ucfirst $section;
17548             }
17549             else {
17550                 push @sections, $section;
17551             }
17552         }
17553     }
17554     trace "returning", join "", @sections if main::DEBUG && $to_trace;
17555     return join "", @sections;
17556 }
17557
17558 sub randomize_loose_name($;$) {
17559     # This used only for making the test script
17560
17561     my $name = shift;
17562     my $want_error = shift;  # if true, make an error
17563     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17564
17565     $name = randomize_stricter_name($name);
17566
17567     my @parts;
17568     push @parts, $good_loose_seps[rand(@good_loose_seps)];
17569
17570     # Preserve trailing ones for the sake of not stripping the underscore from
17571     # 'L_'
17572     for my $part (split /[-\s_]+ (?= . )/, $name) {
17573         if (@parts) {
17574             if ($want_error and rand() < 0.3) {
17575                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
17576                 $want_error = 0;
17577             }
17578             else {
17579                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
17580             }
17581         }
17582         push @parts, $part;
17583     }
17584     my $new = join("", @parts);
17585     trace "$name => $new" if main::DEBUG && $to_trace;
17586
17587     if ($want_error) {
17588         if (rand() >= 0.5) {
17589             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
17590         }
17591         else {
17592             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
17593         }
17594     }
17595     return $new;
17596 }
17597
17598 # Used to make sure don't generate duplicate test cases.
17599 my %test_generated;
17600
17601 sub make_property_test_script() {
17602     # This used only for making the test script
17603     # this written directly -- it's huge.
17604
17605     print "Making test script\n" if $verbosity >= $PROGRESS;
17606
17607     # This uses randomness to test different possibilities without testing all
17608     # possibilities.  To ensure repeatability, set the seed to 0.  But if
17609     # tests are added, it will perturb all later ones in the .t file
17610     srand 0;
17611
17612     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
17613
17614     # Keep going down an order of magnitude
17615     # until find that adding this quantity to
17616     # 1 remains 1; but put an upper limit on
17617     # this so in case this algorithm doesn't
17618     # work properly on some platform, that we
17619     # won't loop forever.
17620     my $digits = 0;
17621     my $min_floating_slop = 1;
17622     while (1+ $min_floating_slop != 1
17623             && $digits++ < 50)
17624     {
17625         my $next = $min_floating_slop / 10;
17626         last if $next == 0; # If underflows,
17627                             # use previous one
17628         $min_floating_slop = $next;
17629     }
17630
17631     # It doesn't matter whether the elements of this array contain single lines
17632     # or multiple lines. main::write doesn't count the lines.
17633     my @output;
17634
17635     # Sort these so get results in same order on different runs of this
17636     # program
17637     foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
17638         foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
17639
17640             # Find code points that match, and don't match this table.
17641             my $valid = $table->get_valid_code_point;
17642             my $invalid = $table->get_invalid_code_point;
17643             my $warning = ($table->status eq $DEPRECATED)
17644                             ? "'deprecated'"
17645                             : '""';
17646
17647             # Test each possible combination of the property's aliases with
17648             # the table's.  If this gets to be too many, could do what is done
17649             # in the set_final_comment() for Tables
17650             my @table_aliases = $table->aliases;
17651             my @property_aliases = $table->property->aliases;
17652
17653             # Every property can be optionally be prefixed by 'Is_', so test
17654             # that those work, by creating such a new alias for each
17655             # pre-existing one.
17656             push @property_aliases, map { Alias->new("Is_" . $_->name,
17657                                                     $_->loose_match,
17658                                                     $_->make_re_pod_entry,
17659                                                     $_->ok_as_filename,
17660                                                     $_->status,
17661                                                     $_->ucd,
17662                                                     )
17663                                          } @property_aliases;
17664             my $max = max(scalar @table_aliases, scalar @property_aliases);
17665             for my $j (0 .. $max - 1) {
17666
17667                 # The current alias for property is the next one on the list,
17668                 # or if beyond the end, start over.  Similarly for table
17669                 my $property_name
17670                             = $property_aliases[$j % @property_aliases]->name;
17671
17672                 $property_name = "" if $table->property == $perl;
17673                 my $table_alias = $table_aliases[$j % @table_aliases];
17674                 my $table_name = $table_alias->name;
17675                 my $loose_match = $table_alias->loose_match;
17676
17677                 # If the table doesn't have a file, any test for it is
17678                 # already guaranteed to be in error
17679                 my $already_error = ! $table->file_path;
17680
17681                 # Generate error cases for this alias.
17682                 push @output, generate_error($property_name,
17683                                              $table_name,
17684                                              $already_error);
17685
17686                 # If the table is guaranteed to always generate an error,
17687                 # quit now without generating success cases.
17688                 next if $already_error;
17689
17690                 # Now for the success cases.
17691                 my $random;
17692                 if ($loose_match) {
17693
17694                     # For loose matching, create an extra test case for the
17695                     # standard name.
17696                     my $standard = standardize($table_name);
17697
17698                     # $test_name should be a unique combination for each test
17699                     # case; used just to avoid duplicate tests
17700                     my $test_name = "$property_name=$standard";
17701
17702                     # Don't output duplicate test cases.
17703                     if (! exists $test_generated{$test_name}) {
17704                         $test_generated{$test_name} = 1;
17705                         push @output, generate_tests($property_name,
17706                                                      $standard,
17707                                                      $valid,
17708                                                      $invalid,
17709                                                      $warning,
17710                                                  );
17711                     }
17712                     $random = randomize_loose_name($table_name)
17713                 }
17714                 else { # Stricter match
17715                     $random = randomize_stricter_name($table_name);
17716                 }
17717
17718                 # Now for the main test case for this alias.
17719                 my $test_name = "$property_name=$random";
17720                 if (! exists $test_generated{$test_name}) {
17721                     $test_generated{$test_name} = 1;
17722                     push @output, generate_tests($property_name,
17723                                                  $random,
17724                                                  $valid,
17725                                                  $invalid,
17726                                                  $warning,
17727                                              );
17728
17729                     # If the name is a rational number, add tests for the
17730                     # floating point equivalent.
17731                     if ($table_name =~ qr{/}) {
17732
17733                         # Calculate the float, and find just the fraction.
17734                         my $float = eval $table_name;
17735                         my ($whole, $fraction)
17736                                             = $float =~ / (.*) \. (.*) /x;
17737
17738                         # Starting with one digit after the decimal point,
17739                         # create a test for each possible precision (number of
17740                         # digits past the decimal point) until well beyond the
17741                         # native number found on this machine.  (If we started
17742                         # with 0 digits, it would be an integer, which could
17743                         # well match an unrelated table)
17744                         PLACE:
17745                         for my $i (1 .. $min_floating_slop + 3) {
17746                             my $table_name = sprintf("%.*f", $i, $float);
17747                             if ($i < $MIN_FRACTION_LENGTH) {
17748
17749                                 # If the test case has fewer digits than the
17750                                 # minimum acceptable precision, it shouldn't
17751                                 # succeed, so we expect an error for it.
17752                                 # E.g., 2/3 = .7 at one decimal point, and we
17753                                 # shouldn't say it matches .7.  We should make
17754                                 # it be .667 at least before agreeing that the
17755                                 # intent was to match 2/3.  But at the
17756                                 # less-than- acceptable level of precision, it
17757                                 # might actually match an unrelated number.
17758                                 # So don't generate a test case if this
17759                                 # conflating is possible.  In our example, we
17760                                 # don't want 2/3 matching 7/10, if there is
17761                                 # a 7/10 code point.
17762                                 for my $existing
17763                                         (keys %nv_floating_to_rational)
17764                                 {
17765                                     next PLACE
17766                                         if abs($table_name - $existing)
17767                                                 < $MAX_FLOATING_SLOP;
17768                                 }
17769                                 push @output, generate_error($property_name,
17770                                                              $table_name,
17771                                                              1   # 1 => already an error
17772                                               );
17773                             }
17774                             else {
17775
17776                                 # Here the number of digits exceeds the
17777                                 # minimum we think is needed.  So generate a
17778                                 # success test case for it.
17779                                 push @output, generate_tests($property_name,
17780                                                              $table_name,
17781                                                              $valid,
17782                                                              $invalid,
17783                                                              $warning,
17784                                              );
17785                             }
17786                         }
17787                     }
17788                 }
17789             }
17790         }
17791     }
17792
17793     &write($t_path,
17794            0,           # Not utf8;
17795            [<DATA>,
17796             @output,
17797             (map {"Test_X('$_');\n"} @backslash_X_tests),
17798             "Finished();\n"]);
17799     return;
17800 }
17801
17802 sub make_normalization_test_script() {
17803     print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17804
17805     my $n_path = 'TestNorm.pl';
17806
17807     unshift @normalization_tests, <<'END';
17808 use utf8;
17809 use Test::More;
17810
17811 sub ord_string {    # Convert packed ords to printable string
17812     use charnames ();
17813     return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17814                                                 unpack "U*", shift) .  "'";
17815     #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
17816 }
17817
17818 sub Test_N {
17819     my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17820     my $display_source = ord_string($source);
17821     my $display_nfc = ord_string($nfc);
17822     my $display_nfd = ord_string($nfd);
17823     my $display_nfkc = ord_string($nfkc);
17824     my $display_nfkd = ord_string($nfkd);
17825
17826     use Unicode::Normalize;
17827     #    NFC
17828     #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
17829     #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
17830     #
17831     #    NFD
17832     #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
17833     #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
17834     #
17835     #    NFKC
17836     #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17837     #      toNFKC(nfkc) == toNFKC(nfkd)
17838     #
17839     #    NFKD
17840     #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17841     #      toNFKD(nfkc) == toNFKD(nfkd)
17842
17843     is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17844     is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17845     is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17846     is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17847     is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17848
17849     is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17850     is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17851     is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17852     is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17853     is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17854
17855     is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17856     is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17857     is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17858     is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17859     is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17860
17861     is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17862     is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17863     is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17864     is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17865     is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17866 }
17867 END
17868
17869     &write($n_path,
17870            1,           # Is utf8;
17871            [
17872             @normalization_tests,
17873             'done_testing();'
17874             ]);
17875     return;
17876 }
17877
17878 # This is a list of the input files and how to handle them.  The files are
17879 # processed in their order in this list.  Some reordering is possible if
17880 # desired, but the v0 files should be first, and the extracted before the
17881 # others except DAge.txt (as data in an extracted file can be over-ridden by
17882 # the non-extracted.  Some other files depend on data derived from an earlier
17883 # file, like UnicodeData requires data from Jamo, and the case changing and
17884 # folding requires data from Unicode.  Mostly, it is safest to order by first
17885 # version releases in (except the Jamo).  DAge.txt is read before the
17886 # extracted ones because of the rarely used feature $compare_versions.  In the
17887 # unlikely event that there were ever an extracted file that contained the Age
17888 # property information, it would have to go in front of DAge.
17889 #
17890 # The version strings allow the program to know whether to expect a file or
17891 # not, but if a file exists in the directory, it will be processed, even if it
17892 # is in a version earlier than expected, so you can copy files from a later
17893 # release into an earlier release's directory.
17894 my @input_file_objects = (
17895     Input_file->new('PropertyAliases.txt', v0,
17896                     Handler => \&process_PropertyAliases,
17897                     ),
17898     Input_file->new(undef, v0,  # No file associated with this
17899                     Progress_Message => 'Finishing property setup',
17900                     Handler => \&finish_property_setup,
17901                     ),
17902     Input_file->new('PropValueAliases.txt', v0,
17903                      Handler => \&process_PropValueAliases,
17904                      Has_Missings_Defaults => $NOT_IGNORED,
17905                      ),
17906     Input_file->new('DAge.txt', v3.2.0,
17907                     Has_Missings_Defaults => $NOT_IGNORED,
17908                     Property => 'Age'
17909                     ),
17910     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17911                     Property => 'General_Category',
17912                     ),
17913     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17914                     Property => 'Canonical_Combining_Class',
17915                     Has_Missings_Defaults => $NOT_IGNORED,
17916                     ),
17917     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17918                     Property => 'Numeric_Type',
17919                     Has_Missings_Defaults => $NOT_IGNORED,
17920                     ),
17921     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17922                     Property => 'East_Asian_Width',
17923                     Has_Missings_Defaults => $NOT_IGNORED,
17924                     ),
17925     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17926                     Property => 'Line_Break',
17927                     Has_Missings_Defaults => $NOT_IGNORED,
17928                     ),
17929     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17930                     Property => 'Bidi_Class',
17931                     Has_Missings_Defaults => $NOT_IGNORED,
17932                     ),
17933     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17934                     Property => 'Decomposition_Type',
17935                     Has_Missings_Defaults => $NOT_IGNORED,
17936                     ),
17937     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17938     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17939                     Property => 'Numeric_Value',
17940                     Each_Line_Handler => \&filter_numeric_value_line,
17941                     Has_Missings_Defaults => $NOT_IGNORED,
17942                     ),
17943     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17944                     Property => 'Joining_Group',
17945                     Has_Missings_Defaults => $NOT_IGNORED,
17946                     ),
17947
17948     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17949                     Property => 'Joining_Type',
17950                     Has_Missings_Defaults => $NOT_IGNORED,
17951                     ),
17952     Input_file->new('Jamo.txt', v2.0.0,
17953                     Property => 'Jamo_Short_Name',
17954                     Each_Line_Handler => \&filter_jamo_line,
17955                     ),
17956     Input_file->new('UnicodeData.txt', v1.1.5,
17957                     Pre_Handler => \&setup_UnicodeData,
17958
17959                     # We clean up this file for some early versions.
17960                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
17961                                             ? \&filter_v1_ucd
17962                                             : ($v_version eq v2.1.5)
17963                                                 ? \&filter_v2_1_5_ucd
17964
17965                                                 # And for 5.14 Perls with 6.0,
17966                                                 # have to also make changes
17967                                                 : ($v_version ge v6.0.0
17968                                                    && $^V lt v5.17.0)
17969                                                     ? \&filter_v6_ucd
17970                                                     : undef),
17971
17972                                             # Early versions did not have the
17973                                             # proper Unicode_1 names for the
17974                                             # controls
17975                                             (($v_version lt v3.0.0)
17976                                             ? \&filter_early_U1_names
17977                                             : undef),
17978
17979                                             # Early versions did not correctly
17980                                             # use the later method for giving
17981                                             # decimal digit values
17982                                             (($v_version le v3.2.0)
17983                                             ? \&filter_bad_Nd_ucd
17984                                             : undef),
17985
17986                                             # And the main filter
17987                                             \&filter_UnicodeData_line,
17988                                          ],
17989                     EOF_Handler => \&EOF_UnicodeData,
17990                     ),
17991     Input_file->new('ArabicShaping.txt', v2.0.0,
17992                     Each_Line_Handler =>
17993                         ($v_version lt 4.1.0)
17994                                     ? \&filter_old_style_arabic_shaping
17995                                     : undef,
17996                     # The first field after the range is a "schematic name"
17997                     # not used by Perl
17998                     Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
17999                     Has_Missings_Defaults => $NOT_IGNORED,
18000                     ),
18001     Input_file->new('Blocks.txt', v2.0.0,
18002                     Property => 'Block',
18003                     Has_Missings_Defaults => $NOT_IGNORED,
18004                     Each_Line_Handler => \&filter_blocks_lines
18005                     ),
18006     Input_file->new('PropList.txt', v2.0.0,
18007                     Each_Line_Handler => (($v_version lt v3.1.0)
18008                                             ? \&filter_old_style_proplist
18009                                             : undef),
18010                     ),
18011     Input_file->new('Unihan.txt', v2.0.0,
18012                     Pre_Handler => \&setup_unihan,
18013                     Optional => 1,
18014                     Each_Line_Handler => \&filter_unihan_line,
18015                         ),
18016     Input_file->new('SpecialCasing.txt', v2.1.8,
18017                     Each_Line_Handler => ($v_version eq 2.1.8)
18018                                          ? \&filter_2_1_8_special_casing_line
18019                                          : \&filter_special_casing_line,
18020                     Pre_Handler => \&setup_special_casing,
18021                     Has_Missings_Defaults => $IGNORED,
18022                     ),
18023     Input_file->new(
18024                     'LineBreak.txt', v3.0.0,
18025                     Has_Missings_Defaults => $NOT_IGNORED,
18026                     Property => 'Line_Break',
18027                     # Early versions had problematic syntax
18028                     Each_Line_Handler => (($v_version lt v3.1.0)
18029                                         ? \&filter_early_ea_lb
18030                                         : undef),
18031                     ),
18032     Input_file->new('EastAsianWidth.txt', v3.0.0,
18033                     Property => 'East_Asian_Width',
18034                     Has_Missings_Defaults => $NOT_IGNORED,
18035                     # Early versions had problematic syntax
18036                     Each_Line_Handler => (($v_version lt v3.1.0)
18037                                         ? \&filter_early_ea_lb
18038                                         : undef),
18039                     ),
18040     Input_file->new('CompositionExclusions.txt', v3.0.0,
18041                     Property => 'Composition_Exclusion',
18042                     ),
18043     Input_file->new('BidiMirroring.txt', v3.0.1,
18044                     Property => 'Bidi_Mirroring_Glyph',
18045                     Has_Missings_Defaults => ($v_version lt v6.2.0)
18046                                               ? $NO_DEFAULTS
18047                                               # Is <none> which doesn't mean
18048                                               # anything to us, we will use the
18049                                               # null string
18050                                               : $IGNORED,
18051
18052                     ),
18053     Input_file->new("NormTest.txt", v3.0.0,
18054                      Handler => \&process_NormalizationsTest,
18055                      Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
18056                     ),
18057     Input_file->new('CaseFolding.txt', v3.0.1,
18058                     Pre_Handler => \&setup_case_folding,
18059                     Each_Line_Handler =>
18060                         [ ($v_version lt v3.1.0)
18061                                  ? \&filter_old_style_case_folding
18062                                  : undef,
18063                            \&filter_case_folding_line
18064                         ],
18065                     Has_Missings_Defaults => $IGNORED,
18066                     ),
18067     Input_file->new('DCoreProperties.txt', v3.1.0,
18068                     # 5.2 changed this file
18069                     Has_Missings_Defaults => (($v_version ge v5.2.0)
18070                                             ? $NOT_IGNORED
18071                                             : $NO_DEFAULTS),
18072                     ),
18073     Input_file->new('Scripts.txt', v3.1.0,
18074                     Property => 'Script',
18075                     Has_Missings_Defaults => $NOT_IGNORED,
18076                     ),
18077     Input_file->new('DNormalizationProps.txt', v3.1.0,
18078                     Has_Missings_Defaults => $NOT_IGNORED,
18079                     Each_Line_Handler => (($v_version lt v4.0.1)
18080                                       ? \&filter_old_style_normalization_lines
18081                                       : undef),
18082                     ),
18083     Input_file->new('HangulSyllableType.txt', v0,
18084                     Has_Missings_Defaults => $NOT_IGNORED,
18085                     Property => 'Hangul_Syllable_Type',
18086                     Pre_Handler => ($v_version lt v4.0.0)
18087                                    ? \&generate_hst
18088                                    : undef,
18089                     ),
18090     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
18091                     Property => 'Word_Break',
18092                     Has_Missings_Defaults => $NOT_IGNORED,
18093                     ),
18094     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
18095                     Property => 'Grapheme_Cluster_Break',
18096                     Has_Missings_Defaults => $NOT_IGNORED,
18097                     Pre_Handler => ($v_version lt v4.1.0)
18098                                    ? \&generate_GCB
18099                                    : undef,
18100                     ),
18101     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
18102                     Handler => \&process_GCB_test,
18103                     ),
18104     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
18105                     Skip => 'Validation Tests',
18106                     ),
18107     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
18108                     Skip => 'Validation Tests',
18109                     ),
18110     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
18111                     Skip => 'Validation Tests',
18112                     ),
18113     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
18114                     Property => 'Sentence_Break',
18115                     Has_Missings_Defaults => $NOT_IGNORED,
18116                     ),
18117     Input_file->new('NamedSequences.txt', v4.1.0,
18118                     Handler => \&process_NamedSequences
18119                     ),
18120     Input_file->new('NameAliases.txt', v0,
18121                     Property => 'Name_Alias',
18122                     Pre_Handler => ($v_version le v6.0.0)
18123                                    ? \&setup_early_name_alias
18124                                    : undef,
18125                     Each_Line_Handler => ($v_version le v6.0.0)
18126                                    ? \&filter_early_version_name_alias_line
18127                                    : \&filter_later_version_name_alias_line,
18128                     ),
18129     Input_file->new("BidiTest.txt", v5.2.0,
18130                     Skip => 'Validation Tests',
18131                     ),
18132     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
18133                     Optional => 1,
18134                     Each_Line_Handler => \&filter_unihan_line,
18135                     ),
18136     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
18137                     Optional => 1,
18138                     Each_Line_Handler => \&filter_unihan_line,
18139                     ),
18140     Input_file->new('UnihanIRGSources.txt', v5.2.0,
18141                     Optional => 1,
18142                     Pre_Handler => \&setup_unihan,
18143                     Each_Line_Handler => \&filter_unihan_line,
18144                     ),
18145     Input_file->new('UnihanNumericValues.txt', v5.2.0,
18146                     Optional => 1,
18147                     Each_Line_Handler => \&filter_unihan_line,
18148                     ),
18149     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
18150                     Optional => 1,
18151                     Each_Line_Handler => \&filter_unihan_line,
18152                     ),
18153     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
18154                     Optional => 1,
18155                     Each_Line_Handler => \&filter_unihan_line,
18156                     ),
18157     Input_file->new('UnihanReadings.txt', v5.2.0,
18158                     Optional => 1,
18159                     Each_Line_Handler => \&filter_unihan_line,
18160                     ),
18161     Input_file->new('UnihanVariants.txt', v5.2.0,
18162                     Optional => 1,
18163                     Each_Line_Handler => \&filter_unihan_line,
18164                     ),
18165     Input_file->new('ScriptExtensions.txt', v6.0.0,
18166                     Property => 'Script_Extensions',
18167                     Pre_Handler => \&setup_script_extensions,
18168                     Each_Line_Handler => \&filter_script_extensions_line,
18169                     Has_Missings_Defaults => (($v_version le v6.0.0)
18170                                             ? $NO_DEFAULTS
18171                                             : $IGNORED),
18172                     ),
18173     # The two Indic files are actually available starting in v6.0.0, but their
18174     # property values are missing from PropValueAliases.txt in that release,
18175     # so that further work would have to be done to get them to work properly
18176     # for that release.
18177     Input_file->new('IndicMatraCategory.txt', v6.1.0,
18178                     Property => 'Indic_Matra_Category',
18179                     Has_Missings_Defaults => $NOT_IGNORED,
18180                     Skip => "Provisional; for the analysis and processing of Indic scripts",
18181                     ),
18182     Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
18183                     Property => 'Indic_Syllabic_Category',
18184                     Has_Missings_Defaults => $NOT_IGNORED,
18185                     Skip => "Provisional; for the analysis and processing of Indic scripts",
18186                     ),
18187     Input_file->new('BidiBrackets.txt', v6.3.0,
18188                     Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
18189                     Has_Missings_Defaults => $NO_DEFAULTS,
18190                     ),
18191     Input_file->new("BidiCharacterTest.txt", v6.3.0,
18192                     Skip => 'Validation Tests',
18193                     ),
18194 );
18195
18196 # End of all the preliminaries.
18197 # Do it...
18198
18199 if ($compare_versions) {
18200     Carp::my_carp(<<END
18201 Warning.  \$compare_versions is set.  Output is not suitable for production
18202 END
18203     );
18204 }
18205
18206 # Put into %potential_files a list of all the files in the directory structure
18207 # that could be inputs to this program, excluding those that we should ignore.
18208 # Use absolute file names because it makes it easier across machine types.
18209 my @ignored_files_full_names = map { File::Spec->rel2abs(
18210                                      internal_file_to_platform($_))
18211                                 } keys %ignored_files;
18212 File::Find::find({
18213     wanted=>sub {
18214         return unless /\.txt$/i;  # Some platforms change the name's case
18215         my $full = lc(File::Spec->rel2abs($_));
18216         $potential_files{$full} = 1
18217                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
18218         return;
18219     }
18220 }, File::Spec->curdir());
18221
18222 my @mktables_list_output_files;
18223 my $old_start_time = 0;
18224 my $old_options = "";
18225
18226 if (! -e $file_list) {
18227     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
18228     $write_unchanged_files = 1;
18229 } elsif ($write_unchanged_files) {
18230     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
18231 }
18232 else {
18233     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
18234     my $file_handle;
18235     if (! open $file_handle, "<", $file_list) {
18236         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
18237         $glob_list = 1;
18238     }
18239     else {
18240         my @input;
18241
18242         # Read and parse mktables.lst, placing the results from the first part
18243         # into @input, and the second part into @mktables_list_output_files
18244         for my $list ( \@input, \@mktables_list_output_files ) {
18245             while (<$file_handle>) {
18246                 s/^ \s+ | \s+ $//xg;
18247                 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
18248                     $old_start_time = $1;
18249                     next;
18250                 }
18251                 if (/^ \s* \# \s* From\ options\ (.+) /x) {
18252                     $old_options = $1;
18253                     next;
18254                 }
18255                 next if /^ \s* (?: \# .* )? $/x;
18256                 last if /^ =+ $/x;
18257                 my ( $file ) = split /\t/;
18258                 push @$list, $file;
18259             }
18260             @$list = uniques(@$list);
18261             next;
18262         }
18263
18264         # Look through all the input files
18265         foreach my $input (@input) {
18266             next if $input eq 'version'; # Already have checked this.
18267
18268             # Ignore if doesn't exist.  The checking about whether we care or
18269             # not is done via the Input_file object.
18270             next if ! file_exists($input);
18271
18272             # The paths are stored with relative names, and with '/' as the
18273             # delimiter; convert to absolute on this machine
18274             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
18275             $potential_files{lc $full} = 1
18276                 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
18277         }
18278     }
18279
18280     close $file_handle;
18281 }
18282
18283 if ($glob_list) {
18284
18285     # Here wants to process all .txt files in the directory structure.
18286     # Convert them to full path names.  They are stored in the platform's
18287     # relative style
18288     my @known_files;
18289     foreach my $object (@input_file_objects) {
18290         my $file = $object->file;
18291         next unless defined $file;
18292         push @known_files, File::Spec->rel2abs($file);
18293     }
18294
18295     my @unknown_input_files;
18296     foreach my $file (keys %potential_files) {  # The keys are stored in lc
18297         next if grep { $file eq lc($_) } @known_files;
18298
18299         # Here, the file is unknown to us.  Get relative path name
18300         $file = File::Spec->abs2rel($file);
18301         push @unknown_input_files, $file;
18302
18303         # What will happen is we create a data structure for it, and add it to
18304         # the list of input files to process.  First get the subdirectories
18305         # into an array
18306         my (undef, $directories, undef) = File::Spec->splitpath($file);
18307         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
18308         my @directories = File::Spec->splitdir($directories);
18309
18310         # If the file isn't extracted (meaning none of the directories is the
18311         # extracted one), just add it to the end of the list of inputs.
18312         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
18313             push @input_file_objects, Input_file->new($file, v0);
18314         }
18315         else {
18316
18317             # Here, the file is extracted.  It needs to go ahead of most other
18318             # processing.  Search for the first input file that isn't a
18319             # special required property (that is, find one whose first_release
18320             # is non-0), and isn't extracted.  Also, the Age property file is
18321             # processed before the extracted ones, just in case
18322             # $compare_versions is set.
18323             for (my $i = 0; $i < @input_file_objects; $i++) {
18324                 if ($input_file_objects[$i]->first_released ne v0
18325                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
18326                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
18327                 {
18328                     splice @input_file_objects, $i, 0,
18329                                                 Input_file->new($file, v0);
18330                     last;
18331                 }
18332             }
18333
18334         }
18335     }
18336     if (@unknown_input_files) {
18337         print STDERR simple_fold(join_lines(<<END
18338
18339 The following files are unknown as to how to handle.  Assuming they are
18340 typical property files.  You'll know by later error messages if it worked or
18341 not:
18342 END
18343         ) . " " . join(", ", @unknown_input_files) . "\n\n");
18344     }
18345 } # End of looking through directory structure for more .txt files.
18346
18347 # Create the list of input files from the objects we have defined, plus
18348 # version
18349 my @input_files = qw(version Makefile);
18350 foreach my $object (@input_file_objects) {
18351     my $file = $object->file;
18352     next if ! defined $file;    # Not all objects have files
18353     next if $object->optional && ! -e $file;
18354     push @input_files,  $file;
18355 }
18356
18357 if ( $verbosity >= $VERBOSE ) {
18358     print "Expecting ".scalar( @input_files )." input files. ",
18359          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
18360 }
18361
18362 # We set $most_recent to be the most recently changed input file, including
18363 # this program itself (done much earlier in this file)
18364 foreach my $in (@input_files) {
18365     next unless -e $in;        # Keep going even if missing a file
18366     my $mod_time = (stat $in)[9];
18367     $most_recent = $mod_time if $mod_time > $most_recent;
18368
18369     # See that the input files have distinct names, to warn someone if they
18370     # are adding a new one
18371     if ($make_list) {
18372         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
18373         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
18374         my @directories = File::Spec->splitdir($directories);
18375         my $base = $file =~ s/\.txt$//;
18376         construct_filename($file, 'mutable', \@directories);
18377     }
18378 }
18379
18380 # We use 'Makefile' just to see if it has changed since the last time we
18381 # rebuilt.  Now discard it.
18382 @input_files = grep { $_ ne 'Makefile' } @input_files;
18383
18384 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
18385               || ! scalar @mktables_list_output_files  # or if no outputs known
18386               || $old_start_time < $most_recent        # or out-of-date
18387               || $old_options ne $command_line_arguments; # or with different
18388                                                           # options
18389
18390 # Now we check to see if any output files are older than youngest, if
18391 # they are, we need to continue on, otherwise we can presumably bail.
18392 if (! $rebuild) {
18393     foreach my $out (@mktables_list_output_files) {
18394         if ( ! file_exists($out)) {
18395             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
18396             $rebuild = 1;
18397             last;
18398          }
18399         #local $to_trace = 1 if main::DEBUG;
18400         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
18401         if ( (stat $out)[9] <= $most_recent ) {
18402             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
18403             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
18404             $rebuild = 1;
18405             last;
18406         }
18407     }
18408 }
18409 if (! $rebuild) {
18410     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
18411     exit(0);
18412 }
18413 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
18414
18415 # Ready to do the major processing.  First create the perl pseudo-property.
18416 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
18417
18418 # Process each input file
18419 foreach my $file (@input_file_objects) {
18420     $file->run;
18421 }
18422
18423 # Finish the table generation.
18424
18425 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
18426 finish_Unicode();
18427
18428 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
18429 compile_perl();
18430
18431 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
18432 add_perl_synonyms();
18433
18434 print "Writing tables\n" if $verbosity >= $PROGRESS;
18435 write_all_tables();
18436
18437 # Write mktables.lst
18438 if ( $file_list and $make_list ) {
18439
18440     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
18441     foreach my $file (@input_files, @files_actually_output) {
18442         my (undef, $directories, $file) = File::Spec->splitpath($file);
18443         my @directories = File::Spec->splitdir($directories);
18444         $file = join '/', @directories, $file;
18445     }
18446
18447     my $ofh;
18448     if (! open $ofh,">",$file_list) {
18449         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
18450         return
18451     }
18452     else {
18453         my $localtime = localtime $start_time;
18454         print $ofh <<"END";
18455 #
18456 # $file_list -- File list for $0.
18457 #
18458 #   Autogenerated starting on $start_time ($localtime)
18459 #   From options $command_line_arguments
18460 #
18461 # - First section is input files
18462 #   ($0 itself is not listed but is automatically considered an input)
18463 # - Section separator is /^=+\$/
18464 # - Second section is a list of output files.
18465 # - Lines matching /^\\s*#/ are treated as comments
18466 #   which along with blank lines are ignored.
18467 #
18468
18469 # Input files:
18470
18471 END
18472         print $ofh "$_\n" for sort(@input_files);
18473         print $ofh "\n=================================\n# Output files:\n\n";
18474         print $ofh "$_\n" for sort @files_actually_output;
18475         print $ofh "\n# ",scalar(@input_files)," input files\n",
18476                 "# ",scalar(@files_actually_output)+1," output files\n\n",
18477                 "# End list\n";
18478         close $ofh
18479             or Carp::my_carp("Failed to close $ofh: $!");
18480
18481         print "Filelist has ",scalar(@input_files)," input files and ",
18482             scalar(@files_actually_output)+1," output files\n"
18483             if $verbosity >= $VERBOSE;
18484     }
18485 }
18486
18487 # Output these warnings unless -q explicitly specified.
18488 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
18489     if (@unhandled_properties) {
18490         print "\nProperties and tables that unexpectedly have no code points\n";
18491         foreach my $property (sort @unhandled_properties) {
18492             print $property, "\n";
18493         }
18494     }
18495
18496     if (%potential_files) {
18497         print "\nInput files that are not considered:\n";
18498         foreach my $file (sort keys %potential_files) {
18499             print File::Spec->abs2rel($file), "\n";
18500         }
18501     }
18502     print "\nAll done\n" if $verbosity >= $VERBOSE;
18503 }
18504 exit(0);
18505
18506 # TRAILING CODE IS USED BY make_property_test_script()
18507 __DATA__
18508
18509 use strict;
18510 use warnings;
18511
18512 # If run outside the normal test suite on an ASCII platform, you can
18513 # just create a latin1_to_native() function that just returns its
18514 # inputs, because that's the only function used from test.pl
18515 require "test.pl";
18516
18517 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
18518 # constructed by mktables from the tables it generates, so if mktables is
18519 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
18520 # feasible properties; a few aren't currently feasible; see
18521 # is_code_point_usable() in mktables for details.
18522
18523 # Standard test packages are not used because this manipulates SIG_WARN.  It
18524 # exits 0 if every non-skipped test succeeded; -1 if any failed.
18525
18526 my $Tests = 0;
18527 my $Fails = 0;
18528
18529 sub Expect($$$$) {
18530     my $expected = shift;
18531     my $ord = shift;
18532     my $regex  = shift;
18533     my $warning_type = shift;   # Type of warning message, like 'deprecated'
18534                                 # or empty if none
18535     my $line   = (caller)[2];
18536
18537     # Convert the code point to hex form
18538     my $string = sprintf "\"\\x{%04X}\"", $ord;
18539
18540     my @tests = "";
18541
18542     # The first time through, use all warnings.  If the input should generate
18543     # a warning, add another time through with them turned off
18544     push @tests, "no warnings '$warning_type';" if $warning_type;
18545
18546     foreach my $no_warnings (@tests) {
18547
18548         # Store any warning messages instead of outputting them
18549         local $SIG{__WARN__} = $SIG{__WARN__};
18550         my $warning_message;
18551         $SIG{__WARN__} = sub { $warning_message = $_[0] };
18552
18553         $Tests++;
18554
18555         # A string eval is needed because of the 'no warnings'.
18556         # Assumes no parens in the regular expression
18557         my $result = eval "$no_warnings
18558                             my \$RegObj = qr($regex);
18559                             $string =~ \$RegObj ? 1 : 0";
18560         if (not defined $result) {
18561             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
18562             $Fails++;
18563         }
18564         elsif ($result ^ $expected) {
18565             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
18566             $Fails++;
18567         }
18568         elsif ($warning_message) {
18569             if (! $warning_type || ($warning_type && $no_warnings)) {
18570                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
18571                 $Fails++;
18572             }
18573             else {
18574                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
18575             }
18576         }
18577         elsif ($warning_type && ! $no_warnings) {
18578             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
18579             $Fails++;
18580         }
18581         else {
18582             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
18583         }
18584     }
18585     return;
18586 }
18587
18588 sub Error($) {
18589     my $regex  = shift;
18590     $Tests++;
18591     if (eval { 'x' =~ qr/$regex/; 1 }) {
18592         $Fails++;
18593         my $line = (caller)[2];
18594         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
18595     }
18596     else {
18597         my $line = (caller)[2];
18598         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
18599     }
18600     return;
18601 }
18602
18603 # GCBTest.txt character that separates grapheme clusters
18604 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
18605 utf8::upgrade($breakable_utf8);
18606
18607 # GCBTest.txt character that indicates that the adjoining code points are part
18608 # of the same grapheme cluster
18609 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
18610 utf8::upgrade($nobreak_utf8);
18611
18612 sub Test_X($) {
18613     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
18614     # Each such line is a sequence of code points given by their hex numbers,
18615     # separated by the two characters defined just before this subroutine that
18616     # indicate that either there can or cannot be a break between the adjacent
18617     # code points.  If there isn't a break, that means the sequence forms an
18618     # extended grapheme cluster, which means that \X should match the whole
18619     # thing.  If there is a break, \X should stop there.  This is all
18620     # converted by this routine into a match:
18621     #   $string =~ /(\X)/,
18622     # Each \X should match the next cluster; and that is what is checked.
18623
18624     my $template = shift;
18625
18626     my $line   = (caller)[2];
18627
18628     # The line contains characters above the ASCII range, but in Latin1.  It
18629     # may or may not be in utf8, and if it is, it may or may not know it.  So,
18630     # convert these characters to 8 bits.  If knows is in utf8, simply
18631     # downgrade.
18632     if (utf8::is_utf8($template)) {
18633         utf8::downgrade($template);
18634     } else {
18635
18636         # Otherwise, if it is in utf8, but doesn't know it, the next lines
18637         # convert the two problematic characters to their 8-bit equivalents.
18638         # If it isn't in utf8, they don't harm anything.
18639         use bytes;
18640         $template =~ s/$nobreak_utf8/$nobreak/g;
18641         $template =~ s/$breakable_utf8/$breakable/g;
18642     }
18643
18644     # Get rid of the leading and trailing breakables
18645     $template =~ s/^ \s* $breakable \s* //x;
18646     $template =~ s/ \s* $breakable \s* $ //x;
18647
18648     # And no-breaks become just a space.
18649     $template =~ s/ \s* $nobreak \s* / /xg;
18650
18651     # Split the input into segments that are breakable between them.
18652     my @segments = split /\s*$breakable\s*/, $template;
18653
18654     my $string = "";
18655     my $display_string = "";
18656     my @should_match;
18657     my @should_display;
18658
18659     # Convert the code point sequence in each segment into a Perl string of
18660     # characters
18661     foreach my $segment (@segments) {
18662         my @code_points = split /\s+/, $segment;
18663         my $this_string = "";
18664         my $this_display = "";
18665         foreach my $code_point (@code_points) {
18666             $this_string .= latin1_to_native(chr(hex $code_point));
18667             $this_display .= "\\x{$code_point}";
18668         }
18669
18670         # The next cluster should match the string in this segment.
18671         push @should_match, $this_string;
18672         push @should_display, $this_display;
18673         $string .= $this_string;
18674         $display_string .= $this_display;
18675     }
18676
18677     # If a string can be represented in both non-ut8 and utf8, test both cases
18678     UPGRADE:
18679     for my $to_upgrade (0 .. 1) {
18680
18681         if ($to_upgrade) {
18682
18683             # If already in utf8, would just be a repeat
18684             next UPGRADE if utf8::is_utf8($string);
18685
18686             utf8::upgrade($string);
18687         }
18688
18689         # Finally, do the \X match.
18690         my @matches = $string =~ /(\X)/g;
18691
18692         # Look through each matched cluster to verify that it matches what we
18693         # expect.
18694         my $min = (@matches < @should_match) ? @matches : @should_match;
18695         for my $i (0 .. $min - 1) {
18696             $Tests++;
18697             if ($matches[$i] eq $should_match[$i]) {
18698                 print "ok $Tests - ";
18699                 if ($i == 0) {
18700                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18701                 } else {
18702                     print "And \\X #", $i + 1,
18703                 }
18704                 print " correctly matched $should_display[$i]; line $line\n";
18705             } else {
18706                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18707                                                     unpack("U*", $matches[$i]));
18708                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18709                     $i + 1,
18710                     " should have matched $should_display[$i]",
18711                     " but instead matched $matches[$i]",
18712                     ".  Abandoning rest of line $line\n";
18713                 next UPGRADE;
18714             }
18715         }
18716
18717         # And the number of matches should equal the number of expected matches.
18718         $Tests++;
18719         if (@matches == @should_match) {
18720             print "ok $Tests - Nothing was left over; line $line\n";
18721         } else {
18722             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18723         }
18724     }
18725
18726     return;
18727 }
18728
18729 sub Finished() {
18730     print "1..$Tests\n";
18731     exit($Fails ? -1 : 0);
18732 }
18733
18734 Error('\p{Script=InGreek}');    # Bug #69018
18735 Test_X("1100 $nobreak 1161");  # Bug #70940
18736 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18737 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18738 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726