lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34 use re "/aa";
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 sub NON_ASCII_PLATFORM { ord("A") != 65 }
  40
  41 ##########################################################################
  42 #
  43 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  44 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  45 # a pod file and .t files, depending on option parameters.
  46 #
  47 # The structure of this file is:
  48 #   First these introductory comments; then
  49 #   code needed for everywhere, such as debugging stuff; then
  50 #   code to handle input parameters; then
  51 #   data structures likely to be of external interest (some of which depend on
  52 #       the input parameters, so follows them; then
  53 #   more data structures and subroutine and package (class) definitions; then
  54 #   the small actual loop to process the input files and finish up; then
  55 #   a __DATA__ section, for the .t tests
  56 #
  57 # This program works on all releases of Unicode so far.  The outputs have been
  58 # scrutinized most intently for release 5.1.  The others have been checked for
  59 # somewhat more than just sanity.  It can handle all non-provisional Unicode
  60 # character properties in those releases.
  61 #
  62 # This program is mostly about Unicode character (or code point) properties.
  63 # A property describes some attribute or quality of a code point, like if it
  64 # is lowercase or not, its name, what version of Unicode it was first defined
  65 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  66 # possibilities by making all properties into mappings from each code point
  67 # into some corresponding value.  In the case of it being lowercase or not,
  68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  69 # property maps each Unicode code point to a single value, called a "property
  70 # value".  (Some more recently defined properties, map a code point to a set
  71 # of values.)
  72 #
  73 # When using a property in a regular expression, what is desired isn't the
  74 # mapping of the code point to its property's value, but the reverse (or the
  75 # mathematical "inverse relation"): starting with the property value, "Does a
  76 # code point map to it?"  These are written in a "compound" form:
  77 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  78 # files containing the lists of code points that map to each such regular
  79 # expression property value, one file per list
  80 #
  81 # There is also a single form shortcut that Perl adds for many of the commonly
  82 # used properties.  This happens for all binary properties, plus script,
  83 # general_category, and block properties.
  84 #
  85 # Thus the outputs of this program are files.  There are map files, mostly in
  86 # the 'To' directory; and there are list files for use in regular expression
  87 # matching, all in subdirectories of the 'lib' directory, with each
  88 # subdirectory being named for the property that the lists in it are for.
  89 # Bookkeeping, test, and documentation files are also generated.
  90
  91 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  92 my $map_directory = 'To';        # Where map files go.
  93
  94 # DATA STRUCTURES
  95 #
  96 # The major data structures of this program are Property, of course, but also
  97 # Table.  There are two kinds of tables, very similar to each other.
  98 # "Match_Table" is the data structure giving the list of code points that have
  99 # a particular property value, mentioned above.  There is also a "Map_Table"
 100 # data structure which gives the property's mapping from code point to value.
 101 # There are two structures because the match tables need to be combined in
 102 # various ways, such as constructing unions, intersections, complements, etc.,
 103 # and the map ones don't.  And there would be problems, perhaps subtle, if
 104 # a map table were inadvertently operated on in some of those ways.
 105 # The use of separate classes with operations defined on one but not the other
 106 # prevents accidentally confusing the two.
 107 #
 108 # At the heart of each table's data structure is a "Range_List", which is just
 109 # an ordered list of "Ranges", plus ancillary information, and methods to
 110 # operate on them.  A Range is a compact way to store property information.
 111 # Each range has a starting code point, an ending code point, and a value that
 112 # is meant to apply to all the code points between the two end points,
 113 # inclusive.  For a map table, this value is the property value for those
 114 # code points.  Two such ranges could be written like this:
 115 #   0x41 .. 0x5A, 'Upper',
 116 #   0x61 .. 0x7A, 'Lower'
 117 #
 118 # Each range also has a type used as a convenience to classify the values.
 119 # Most ranges in this program will be Type 0, or normal, but there are some
 120 # ranges that have a non-zero type.  These are used only in map tables, and
 121 # are for mappings that don't fit into the normal scheme of things.  Mappings
 122 # that require a hash entry to communicate with utf8.c are one example;
 123 # another example is mappings for charnames.pm to use which indicate a name
 124 # that is algorithmically determinable from its code point (and the reverse).
 125 # These are used to significantly compact these tables, instead of listing
 126 # each one of the tens of thousands individually.
 127 #
 128 # In a match table, the value of a range is irrelevant (and hence the type as
 129 # well, which will always be 0), and arbitrarily set to the null string.
 130 # Using the example above, there would be two match tables for those two
 131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 132 # named Lower would contain 0x61..0x7A.
 133 #
 134 # Actually, there are two types of range lists, "Range_Map" is the one
 135 # associated with map tables, and "Range_List" with match tables.
 136 # Again, this is so that methods can be defined on one and not the others so
 137 # as to prevent operating on them in incorrect ways.
 138 #
 139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 140 # in the perl core.  All tables could in theory be written, but some are
 141 # suppressed because there is no current practical use for them.  It is easy
 142 # to change which get written by changing various lists that are near the top
 143 # of the actual code in this file.  The table data structures contain enough
 144 # ancillary information to allow them to be treated as separate entities for
 145 # writing, such as the path to each one's file.  There is a heading in each
 146 # map table that gives the format of its entries, and what the map is for all
 147 # the code points missing from it.  (This allows tables to be more compact.)
 148 #
 149 # The Property data structure contains one or more tables.  All properties
 150 # contain a map table (except the $perl property which is a
 151 # pseudo-property containing only match tables), and any properties that
 152 # are usable in regular expression matches also contain various matching
 153 # tables, one for each value the property can have.  A binary property can
 154 # have two values, True and False (or Y and N, which are preferred by Unicode
 155 # terminology).  Thus each of these properties will have a map table that
 156 # takes every code point and maps it to Y or N (but having ranges cuts the
 157 # number of entries in that table way down), and two match tables, one
 158 # which has a list of all the code points that map to Y, and one for all the
 159 # code points that map to N.  (For each binary property, a third table is also
 160 # generated for the pseudo Perl property.  It contains the identical code
 161 # points as the Y table, but can be written in regular expressions, not in the
 162 # compound form, but in a "single" form like \p{IsUppercase}.)  Many
 163 # properties are binary, but some properties have several possible values,
 164 # some have many, and properties like Name have a different value for every
 165 # named code point.  Those will not, unless the controlling lists are changed,
 166 # have their match tables written out.  But all the ones which can be used in
 167 # regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
 168 # a property would have either its map table or its match tables written but
 169 # not both.  Again, what gets written is controlled by lists which can easily
 170 # be changed.  Starting in 5.14, advantage was taken of this, and all the map
 171 # tables needed to reconstruct the Unicode db are now written out, while
 172 # suppressing the Unicode .txt files that contain the data.  Our tables are
 173 # much more compact than the .txt files, so a significant space savings was
 174 # achieved.  Also, tables are not written out that are trivially derivable
 175 # from tables that do get written.  So, there typically is no file containing
 176 # the code points not matched by a binary property (the table for \P{} versus
 177 # lowercase \p{}), since you just need to invert the True table to get the
 178 # False table.
 179
 180 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
 181 # how many match tables there are and the content of the maps.  This 'Type' is
 182 # different than a range 'Type', so don't get confused by the two concepts
 183 # having the same name.
 184 #
 185 # For information about the Unicode properties, see Unicode's UAX44 document:
 186
 187 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 188
 189 # As stated earlier, this program will work on any release of Unicode so far.
 190 # Most obvious problems in earlier data have NOT been corrected except when
 191 # necessary to make Perl or this program work reasonably, and to keep out
 192 # potential security issues.  For example, no folding information was given in
 193 # early releases, so this program substitutes lower case instead, just so that
 194 # a regular expression with the /i option will do something that actually
 195 # gives the right results in many cases.  There are also a couple other
 196 # corrections for version 1.1.5, commented at the point they are made.  As an
 197 # example of corrections that weren't made (but could be) is this statement
 198 # from DerivedAge.txt: "The supplementary private use code points and the
 199 # non-character code points were assigned in version 2.0, but not specifically
 200 # listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
 201 # it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
 202 # further down in these introductory comments.
 203 #
 204 # This program works on all non-provisional properties as of the current
 205 # Unicode release, though the files for some are suppressed for various
 206 # reasons.  You can change which are output by changing lists in this program.
 207 #
 208 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 209 # loose matchings rules (from Unicode TR18):
 210 #
 211 #    The recommended names for UCD properties and property values are in
 212 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 213 #    [PropValue]. There are both abbreviated names and longer, more
 214 #    descriptive names. It is strongly recommended that both names be
 215 #    recognized, and that loose matching of property names be used,
 216 #    whereby the case distinctions, whitespace, hyphens, and underbar
 217 #    are ignored.
 218 #
 219 # The program still allows Fuzzy to override its determination of if loose
 220 # matching should be used, but it isn't currently used, as it is no longer
 221 # needed; the calculations it makes are good enough.
 222 #
 223 # SUMMARY OF HOW IT WORKS:
 224 #
 225 #   Process arguments
 226 #
 227 #   A list is constructed containing each input file that is to be processed
 228 #
 229 #   Each file on the list is processed in a loop, using the associated handler
 230 #   code for each:
 231 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 232 #            first.  These files name the properties and property values.
 233 #            Objects are created of all the property and property value names
 234 #            that the rest of the input should expect, including all synonyms.
 235 #        The other input files give mappings from properties to property
 236 #           values.  That is, they list code points and say what the mapping
 237 #           is under the given property.  Some files give the mappings for
 238 #           just one property; and some for many.  This program goes through
 239 #           each file and populates the properties and their map tables from
 240 #           them.  Some properties are listed in more than one file, and
 241 #           Unicode has set up a precedence as to which has priority if there
 242 #           is a conflict.  Thus the order of processing matters, and this
 243 #           program handles the conflict possibility by processing the
 244 #           overriding input files last, so that if necessary they replace
 245 #           earlier values.
 246 #        After this is all done, the program creates the property mappings not
 247 #            furnished by Unicode, but derivable from what it does give.
 248 #        The tables of code points that match each property value in each
 249 #            property that is accessible by regular expressions are created.
 250 #        The Perl-defined properties are created and populated.  Many of these
 251 #            require data determined from the earlier steps
 252 #        Any Perl-defined synonyms are created, and name clashes between Perl
 253 #            and Unicode are reconciled and warned about.
 254 #        All the properties are written to files
 255 #        Any other files are written, and final warnings issued.
 256 #
 257 # For clarity, a number of operators have been overloaded to work on tables:
 258 #   ~ means invert (take all characters not in the set).  The more
 259 #       conventional '!' is not used because of the possibility of confusing
 260 #       it with the actual boolean operation.
 261 #   + means union
 262 #   - means subtraction
 263 #   & means intersection
 264 # The precedence of these is the order listed.  Parentheses should be
 265 # copiously used.  These are not a general scheme.  The operations aren't
 266 # defined for a number of things, deliberately, to avoid getting into trouble.
 267 # Operations are done on references and affect the underlying structures, so
 268 # that the copy constructors for them have been overloaded to not return a new
 269 # clone, but the input object itself.
 270 #
 271 # The bool operator is deliberately not overloaded to avoid confusion with
 272 # "should it mean if the object merely exists, or also is non-empty?".
 273 #
 274 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 275 #
 276 # This program needs to be able to run under miniperl.  Therefore, it uses a
 277 # minimum of other modules, and hence implements some things itself that could
 278 # be gotten from CPAN
 279 #
 280 # This program uses inputs published by the Unicode Consortium.  These can
 281 # change incompatibly between releases without the Perl maintainers realizing
 282 # it.  Therefore this program is now designed to try to flag these.  It looks
 283 # at the directories where the inputs are, and flags any unrecognized files.
 284 # It keeps track of all the properties in the files it handles, and flags any
 285 # that it doesn't know how to handle.  It also flags any input lines that
 286 # don't match the expected syntax, among other checks.
 287 #
 288 # It is also designed so if a new input file matches one of the known
 289 # templates, one hopefully just needs to add it to a list to have it
 290 # processed.
 291 #
 292 # As mentioned earlier, some properties are given in more than one file.  In
 293 # particular, the files in the extracted directory are supposedly just
 294 # reformattings of the others.  But they contain information not easily
 295 # derivable from the other files, including results for Unihan, which this
 296 # program doesn't ordinarily look at, and for unassigned code points.  They
 297 # also have historically had errors or been incomplete.  In an attempt to
 298 # create the best possible data, this program thus processes them first to
 299 # glean information missing from the other files; then processes those other
 300 # files to override any errors in the extracted ones.  Much of the design was
 301 # driven by this need to store things and then possibly override them.
 302 #
 303 # It tries to keep fatal errors to a minimum, to generate something usable for
 304 # testing purposes.  It always looks for files that could be inputs, and will
 305 # warn about any that it doesn't know how to handle (the -q option suppresses
 306 # the warning).
 307 #
 308 # Why is there more than one type of range?
 309 #   This simplified things.  There are some very specialized code points that
 310 #   have to be handled specially for output, such as Hangul syllable names.
 311 #   By creating a range type (done late in the development process), it
 312 #   allowed this to be stored with the range, and overridden by other input.
 313 #   Originally these were stored in another data structure, and it became a
 314 #   mess trying to decide if a second file that was for the same property was
 315 #   overriding the earlier one or not.
 316 #
 317 # Why are there two kinds of tables, match and map?
 318 #   (And there is a base class shared by the two as well.)  As stated above,
 319 #   they actually are for different things.  Development proceeded much more
 320 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 321 #   give the property value for every code point (actually every code point
 322 #   that doesn't map to a default value).  Match tables are used for regular
 323 #   expression matches, and are essentially the inverse mapping.  Separating
 324 #   the two allows more specialized methods, and error checks so that one
 325 #   can't just take the intersection of two map tables, for example, as that
 326 #   is nonsensical.
 327 #
 328 # What about 'fate' and 'status'.  The concept of a table's fate was created
 329 #   late when it became clear that something more was needed.  The difference
 330 #   between this and 'status' is unclean, and could be improved if someone
 331 #   wanted to spend the effort.
 332 #
 333 # DEBUGGING
 334 #
 335 # This program is written so it will run under miniperl.  Occasionally changes
 336 # will cause an error where the backtrace doesn't work well under miniperl.
 337 # To diagnose the problem, you can instead run it under regular perl, if you
 338 # have one compiled.
 339 #
 340 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 341 # to return true.  Then a line like
 342 #
 343 # local $to_trace = 1 if main::DEBUG;
 344 #
 345 # can be added to enable tracing in its lexical scope (plus dynamic) or until
 346 # you insert another line:
 347 #
 348 # local $to_trace = 0 if main::DEBUG;
 349 #
 350 # To actually trace, use a line like "trace $a, @b, %c, ...;
 351 #
 352 # Some of the more complex subroutines already have trace statements in them.
 353 # Permanent trace statements should be like:
 354 #
 355 # trace ... if main::DEBUG && $to_trace;
 356 #
 357 # If there is just one or a few files that you're debugging, you can easily
 358 # cause most everything else to be skipped.  Change the line
 359 #
 360 # my $debug_skip = 0;
 361 #
 362 # to 1, and every file whose object is in @input_file_objects and doesn't have
 363 # a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
 364 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
 365 #
 366 # To compare the output tables, it may be useful to specify the -annotate
 367 # flag.  This causes the tables to expand so there is one entry for each
 368 # non-algorithmically named code point giving, currently its name, and its
 369 # graphic representation if printable (and you have a font that knows about
 370 # it).  This makes it easier to see what the particular code points are in
 371 # each output table.  The tables are usable, but because they don't have
 372 # ranges (for the most part), a Perl using them will run slower.  Non-named
 373 # code points are annotated with a description of their status, and contiguous
 374 # ones with the same description will be output as a range rather than
 375 # individually.  Algorithmically named characters are also output as ranges,
 376 # except when there are just a few contiguous ones.
 377 #
 378 # FUTURE ISSUES
 379 #
 380 # The program would break if Unicode were to change its names so that
 381 # interior white space, underscores, or dashes differences were significant
 382 # within property and property value names.
 383 #
 384 # It might be easier to use the xml versions of the UCD if this program ever
 385 # would need heavy revision, and the ability to handle old versions was not
 386 # required.
 387 #
 388 # There is the potential for name collisions, in that Perl has chosen names
 389 # that Unicode could decide it also likes.  There have been such collisions in
 390 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 391 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 392 # collisions, which were withdrawn before the final release, because of Perl's
 393 # and other's protests.  These all involved new properties which began with
 394 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 395 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 396 # Unicode document, so they are unlikely to be used by Unicode for another
 397 # purpose.  However, they might try something beginning with 'In', or use any
 398 # of the other Perl-defined properties.  This program will warn you of name
 399 # collisions, and refuse to generate tables with them, but manual intervention
 400 # will be required in this event.  One scheme that could be implemented, if
 401 # necessary, would be to have this program generate another file, or add a
 402 # field to mktables.lst that gives the date of first definition of a property.
 403 # Each new release of Unicode would use that file as a basis for the next
 404 # iteration.  And the Perl synonym addition code could sort based on the age
 405 # of the property, so older properties get priority, and newer ones that clash
 406 # would be refused; hence existing code would not be impacted, and some other
 407 # synonym would have to be used for the new property.  This is ugly, and
 408 # manual intervention would certainly be easier to do in the short run; lets
 409 # hope it never comes to this.
 410 #
 411 # A NOTE ON UNIHAN
 412 #
 413 # This program can generate tables from the Unihan database.  But it doesn't
 414 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 415 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 416 # database was split into 8 different files, all beginning with the letters
 417 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 418 # know which of the many properties in the file(s) should have tables created
 419 # for them.  It will create tables for any properties listed in
 420 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 421 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 422 # property you want is not in those files of the release you are building
 423 # against, you must add it to those two arrays.  Starting in 4.0, the
 424 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 425 # is present in the directory, a table will be generated for that property.
 426 # In 5.2, several more properties were added.  For your convenience, the two
 427 # arrays are initialized with all the 6.0 listed properties that are also in
 428 # earlier releases.  But these are commented out.  You can just uncomment the
 429 # ones you want, or use them as a template for adding entries for other
 430 # properties.
 431 #
 432 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 433 # and filter_unihan_line() are the functions where this is done.  This program
 434 # already does some adjusting to make the lines look more like the rest of the
 435 # Unicode DB;  You can see what that is in filter_unihan_line()
 436 #
 437 # There is a bug in the 3.2 data file in which some values for the
 438 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 439 # could be added for these; or for a particular installation, the Unihan.txt
 440 # file could be edited to fix them.
 441 #
 442 # HOW TO ADD A FILE TO BE PROCESSED
 443 #
 444 # A new file from Unicode needs to have an object constructed for it in
 445 # @input_file_objects, probably at the end or at the end of the extracted
 446 # ones.  The program should warn you if its name will clash with others on
 447 # restrictive file systems, like DOS.  If so, figure out a better name, and
 448 # add lines to the README.perl file giving that.  If the file is a character
 449 # property, it should be in the format that Unicode has implicitly
 450 # standardized for such files for the more recently introduced ones.
 451 # If so, the Input_file constructor for @input_file_objects can just be the
 452 # file name and release it first appeared in.  If not, then it should be
 453 # possible to construct an each_line_handler() to massage the line into the
 454 # standardized form.
 455 #
 456 # For non-character properties, more code will be needed.  You can look at
 457 # the existing entries for clues.
 458 #
 459 # UNICODE VERSIONS NOTES
 460 #
 461 # The Unicode UCD has had a number of errors in it over the versions.  And
 462 # these remain, by policy, in the standard for that version.  Therefore it is
 463 # risky to correct them, because code may be expecting the error.  So this
 464 # program doesn't generally make changes, unless the error breaks the Perl
 465 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 466 # for U+1105, which causes real problems for the algorithms for Jamo
 467 # calculations, so it is changed here.
 468 #
 469 # But it isn't so clear cut as to what to do about concepts that are
 470 # introduced in a later release; should they extend back to earlier releases
 471 # where the concept just didn't exist?  It was easier to do this than to not,
 472 # so that's what was done.  For example, the default value for code points not
 473 # in the files for various properties was probably undefined until changed by
 474 # some version.  No_Block for blocks is such an example.  This program will
 475 # assign No_Block even in Unicode versions that didn't have it.  This has the
 476 # benefit that code being written doesn't have to special case earlier
 477 # versions; and the detriment that it doesn't match the Standard precisely for
 478 # the affected versions.
 479 #
 480 # Here are some observations about some of the issues in early versions:
 481 #
 482 # Prior to version 3.0, there were 3 character decompositions.  These are not
 483 # handled by Unicode::Normalize, nor will it compile when presented a version
 484 # that has them.  However, you can trivially get it to compile by simply
 485 # ignoring those decompositions, by changing the croak to a carp.  At the time
 486 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
 487 #
 488 #   croak("Weird Canonical Decomposition of U+$h");
 489 #
 490 # Simply change to a carp.  It will compile, but will not know about any three
 491 # character decomposition.
 492
 493 # The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
 494 # that the reason is that the CJK block starting at 4E00 was removed from
 495 # PropList, and was not put back in until 3.1.0.  The Perl extension (the
 496 # single property name \p{alpha}) has the correct values.  But the compound
 497 # form is simply not generated until 3.1, as it can be argued that prior to
 498 # this release, this was not an official property.  The comments for
 499 # filter_old_style_proplist() give more details.
 500 #
 501 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 502 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 503 # reason is that 3.2 introduced U+205F=medium math space, which was not
 504 # classed as white space, but Perl figured out that it should have been. 4.0
 505 # reclassified it correctly.
 506 #
 507 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 508 # this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
 509 # became 202, and ATBL was left with no code points, as all the ones that
 510 # mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
 511 # name for the class, it would not have been affected, but if it used the
 512 # mnemonic, it would have been.
 513 #
 514 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 515 # points which eventually came to have this script property value, instead
 516 # mapped to "Unknown".  But in the next release all these code points were
 517 # moved to \p{sc=common} instead.
 518 #
 519 # The default for missing code points for BidiClass is complicated.  Starting
 520 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 521 # tries to do the best it can for earlier releases.  It is done in
 522 # process_PropertyAliases()
 523 #
 524 # In version 2.1.2, the entry in UnicodeData.txt:
 525 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
 526 # should instead be
 527 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
 528 # Without this change, there are casing problems for this character.
 529 #
 530 ##############################################################################
 531
 532 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 533                         # and errors
 534 my $MAX_LINE_WIDTH = 78;
 535
 536 # Debugging aid to skip most files so as to not be distracted by them when
 537 # concentrating on the ones being debugged.  Add
 538 # non_skip => 1,
 539 # to the constructor for those files you want processed when you set this.
 540 # Files with a first version number of 0 are special: they are always
 541 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 542 # UnicodeData.txt must not be skipped if you want this program to not die
 543 # before normal completion.
 544 my $debug_skip = 0;
 545
 546
 547 # Normally these are suppressed.
 548 my $write_Unicode_deprecated_tables = 0;
 549
 550 # Set to 1 to enable tracing.
 551 our $to_trace = 0;
 552
 553 { # Closure for trace: debugging aid
 554     my $print_caller = 1;        # ? Include calling subroutine name
 555     my $main_with_colon = 'main::';
 556     my $main_colon_length = length($main_with_colon);
 557
 558     sub trace {
 559         return unless $to_trace;        # Do nothing if global flag not set
 560
 561         my @input = @_;
 562
 563         local $DB::trace = 0;
 564         $DB::trace = 0;          # Quiet 'used only once' message
 565
 566         my $line_number;
 567
 568         # Loop looking up the stack to get the first non-trace caller
 569         my $caller_line;
 570         my $caller_name;
 571         my $i = 0;
 572         do {
 573             $line_number = $caller_line;
 574             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 575             $caller = $main_with_colon unless defined $caller;
 576
 577             $caller_name = $caller;
 578
 579             # get rid of pkg
 580             $caller_name =~ s/.*:://;
 581             if (substr($caller_name, 0, $main_colon_length)
 582                 eq $main_with_colon)
 583             {
 584                 $caller_name = substr($caller_name, $main_colon_length);
 585             }
 586
 587         } until ($caller_name ne 'trace');
 588
 589         # If the stack was empty, we were called from the top level
 590         $caller_name = 'main' if ($caller_name eq ""
 591                                     || $caller_name eq 'trace');
 592
 593         my $output = "";
 594         foreach my $string (@input) {
 595             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 596             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 597                 $output .= simple_dumper($string);
 598             }
 599             else {
 600                 $string = "$string" if ref $string;
 601                 $string = $UNDEF unless defined $string;
 602                 chomp $string;
 603                 $string = '""' if $string eq "";
 604                 $output .= " " if $output ne ""
 605                                 && $string ne ""
 606                                 && substr($output, -1, 1) ne " "
 607                                 && substr($string, 0, 1) ne " ";
 608                 $output .= $string;
 609             }
 610         }
 611
 612         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 613         print STDERR "$caller_name: " if $print_caller;
 614         print STDERR $output, "\n";
 615         return;
 616     }
 617 }
 618
 619 # This is for a rarely used development feature that allows you to compare two
 620 # versions of the Unicode standard without having to deal with changes caused
 621 # by the code points introduced in the later version.  Change the 0 to a
 622 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 623 # code points introduced in that release and earlier will be used; later ones
 624 # are thrown away.  You use the version number of the earliest one you want to
 625 # compare; then run this program on directory structures containing each
 626 # release, and compare the outputs.  These outputs will therefore include only
 627 # the code points common to both releases, and you can see the changes caused
 628 # just by the underlying release semantic changes.  For versions earlier than
 629 # 3.2, you must copy a version of DAge.txt into the directory.
 630 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 631 my $compare_versions = DEBUG
 632                        && $string_compare_versions
 633                        && pack "C*", split /\./, $string_compare_versions;
 634
 635 sub uniques {
 636     # Returns non-duplicated input values.  From "Perl Best Practices:
 637     # Encapsulated Cleverness".  p. 455 in first edition.
 638
 639     my %seen;
 640     # Arguably this breaks encapsulation, if the goal is to permit multiple
 641     # distinct objects to stringify to the same value, and be interchangeable.
 642     # However, for this program, no two objects stringify identically, and all
 643     # lists passed to this function are either objects or strings. So this
 644     # doesn't affect correctness, but it does give a couple of percent speedup.
 645     no overloading;
 646     return grep { ! $seen{$_}++ } @_;
 647 }
 648
 649 $0 = File::Spec->canonpath($0);
 650
 651 my $make_test_script = 0;      # ? Should we output a test script
 652 my $make_norm_test_script = 0; # ? Should we output a normalization test script
 653 my $write_unchanged_files = 0; # ? Should we update the output files even if
 654                                #    we don't think they have changed
 655 my $use_directory = "";        # ? Should we chdir somewhere.
 656 my $pod_directory;             # input directory to store the pod file.
 657 my $pod_file = 'perluniprops';
 658 my $t_path;                     # Path to the .t test file
 659 my $file_list = 'mktables.lst'; # File to store input and output file names.
 660                                # This is used to speed up the build, by not
 661                                # executing the main body of the program if
 662                                # nothing on the list has changed since the
 663                                # previous build
 664 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 665                                # make a list so that when the pumpking is
 666                                # preparing a release, s/he won't have to do
 667                                # special things
 668 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 669                                # in the input.
 670 my $output_range_counts = $debugging_build;   # ? Should we include the number
 671                                               # of code points in ranges in
 672                                               # the output
 673 my $annotate = 0;              # ? Should character names be in the output
 674
 675 # Verbosity levels; 0 is quiet
 676 my $NORMAL_VERBOSITY = 1;
 677 my $PROGRESS = 2;
 678 my $VERBOSE = 3;
 679
 680 my $verbosity = $NORMAL_VERBOSITY;
 681
 682 # Stored in mktables.lst so that if this program is called with different
 683 # options, will regenerate even if the files otherwise look like they're
 684 # up-to-date.
 685 my $command_line_arguments = join " ", @ARGV;
 686
 687 # Process arguments
 688 while (@ARGV) {
 689     my $arg = shift @ARGV;
 690     if ($arg eq '-v') {
 691         $verbosity = $VERBOSE;
 692     }
 693     elsif ($arg eq '-p') {
 694         $verbosity = $PROGRESS;
 695         $| = 1;     # Flush buffers as we go.
 696     }
 697     elsif ($arg eq '-q') {
 698         $verbosity = 0;
 699     }
 700     elsif ($arg eq '-w') {
 701         $write_unchanged_files = 1; # update the files even if havent changed
 702     }
 703     elsif ($arg eq '-check') {
 704         my $this = shift @ARGV;
 705         my $ok = shift @ARGV;
 706         if ($this ne $ok) {
 707             print "Skipping as check params are not the same.\n";
 708             exit(0);
 709         }
 710     }
 711     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 712         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 713     }
 714     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 715     {
 716         $make_test_script = 1;
 717     }
 718     elsif ($arg eq '-makenormtest')
 719     {
 720         $make_norm_test_script = 1;
 721     }
 722     elsif ($arg eq '-makelist') {
 723         $make_list = 1;
 724     }
 725     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 726         -d $use_directory or croak "Unknown directory '$use_directory'";
 727     }
 728     elsif ($arg eq '-L') {
 729
 730         # Existence not tested until have chdir'd
 731         $file_list = shift;
 732     }
 733     elsif ($arg eq '-globlist') {
 734         $glob_list = 1;
 735     }
 736     elsif ($arg eq '-c') {
 737         $output_range_counts = ! $output_range_counts
 738     }
 739     elsif ($arg eq '-annotate') {
 740         $annotate = 1;
 741         $debugging_build = 1;
 742         $output_range_counts = 1;
 743     }
 744     else {
 745         my $with_c = 'with';
 746         $with_c .= 'out' if $output_range_counts;   # Complements the state
 747         croak <<END;
 748 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 749           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 750           [-check A B ]
 751   -c          : Output comments $with_c number of code points in ranges
 752   -q          : Quiet Mode: Only output serious warnings.
 753   -p          : Set verbosity level to normal plus show progress.
 754   -v          : Set Verbosity level high:  Show progress and non-serious
 755                 warnings
 756   -w          : Write files regardless
 757   -C dir      : Change to this directory before proceeding. All relative paths
 758                 except those specified by the -P and -T options will be done
 759                 with respect to this directory.
 760   -P dir      : Output $pod_file file to directory 'dir'.
 761   -T path     : Create a test script as 'path'; overrides -maketest
 762   -L filelist : Use alternate 'filelist' instead of standard one
 763   -globlist   : Take as input all non-Test *.txt files in current and sub
 764                 directories
 765   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 766                 overrides -T
 767   -makelist   : Rewrite the file list $file_list based on current setup
 768   -annotate   : Output an annotation for each character in the table files;
 769                 useful for debugging mktables, looking at diffs; but is slow,
 770                 memory intensive; resulting tables are usable but are slow and
 771                 very large (and currently fail the Unicode::UCD.t tests).
 772   -check A B  : Executes $0 only if A and B are the same
 773 END
 774     }
 775 }
 776
 777 # Stores the most-recently changed file.  If none have changed, can skip the
 778 # build
 779 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 780
 781 # Change directories now, because need to read 'version' early.
 782 if ($use_directory) {
 783     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 784         $pod_directory = File::Spec->rel2abs($pod_directory);
 785     }
 786     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 787         $t_path = File::Spec->rel2abs($t_path);
 788     }
 789     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 790     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 791         $pod_directory = File::Spec->abs2rel($pod_directory);
 792     }
 793     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 794         $t_path = File::Spec->abs2rel($t_path);
 795     }
 796 }
 797
 798 # Get Unicode version into regular and v-string.  This is done now because
 799 # various tables below get populated based on it.  These tables are populated
 800 # here to be near the top of the file, and so easily seeable by those needing
 801 # to modify things.
 802 open my $VERSION, "<", "version"
 803                     or croak "$0: can't open required file 'version': $!\n";
 804 my $string_version = <$VERSION>;
 805 close $VERSION;
 806 chomp $string_version;
 807 my $v_version = pack "C*", split /\./, $string_version;        # v string
 808
 809 # The following are the complete names of properties with property values that
 810 # are known to not match any code points in some versions of Unicode, but that
 811 # may change in the future so they should be matchable, hence an empty file is
 812 # generated for them.
 813 my @tables_that_may_be_empty;
 814 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
 815                                                     if $v_version lt v6.3.0;
 816 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 817 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 818 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 819                                                     if $v_version ge v4.1.0;
 820 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 821                                                     if $v_version ge v6.0.0;
 822 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 823                                                     if $v_version ge v6.1.0;
 824 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
 825                                                     if $v_version ge v6.2.0;
 826
 827 # The lists below are hashes, so the key is the item in the list, and the
 828 # value is the reason why it is in the list.  This makes generation of
 829 # documentation easier.
 830
 831 my %why_suppressed;  # No file generated for these.
 832
 833 # Files aren't generated for empty extraneous properties.  This is arguable.
 834 # Extraneous properties generally come about because a property is no longer
 835 # used in a newer version of Unicode.  If we generated a file without code
 836 # points, programs that used to work on that property will still execute
 837 # without errors.  It just won't ever match (or will always match, with \P{}).
 838 # This means that the logic is now likely wrong.  I (khw) think its better to
 839 # find this out by getting an error message.  Just move them to the table
 840 # above to change this behavior
 841 my %why_suppress_if_empty_warn_if_not = (
 842
 843    # It is the only property that has ever officially been removed from the
 844    # Standard.  The database never contained any code points for it.
 845    'Special_Case_Condition' => 'Obsolete',
 846
 847    # Apparently never official, but there were code points in some versions of
 848    # old-style PropList.txt
 849    'Non_Break' => 'Obsolete',
 850 );
 851
 852 # These would normally go in the warn table just above, but they were changed
 853 # a long time before this program was written, so warnings about them are
 854 # moot.
 855 if ($v_version gt v3.2.0) {
 856     push @tables_that_may_be_empty,
 857                                 'Canonical_Combining_Class=Attached_Below_Left'
 858 }
 859
 860 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 861 # unless explicitly added.
 862 if ($v_version ge v5.2.0) {
 863     my $unihan = 'Unihan; remove from list if using Unihan';
 864     foreach my $table (qw (
 865                            kAccountingNumeric
 866                            kOtherNumeric
 867                            kPrimaryNumeric
 868                            kCompatibilityVariant
 869                            kIICore
 870                            kIRG_GSource
 871                            kIRG_HSource
 872                            kIRG_JSource
 873                            kIRG_KPSource
 874                            kIRG_MSource
 875                            kIRG_KSource
 876                            kIRG_TSource
 877                            kIRG_USource
 878                            kIRG_VSource
 879                            kRSUnicode
 880                         ))
 881     {
 882         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 883     }
 884 }
 885
 886 # Enum values for to_output_map() method in the Map_Table package.
 887 my $EXTERNAL_MAP = 1;
 888 my $INTERNAL_MAP = 2;
 889 my $OUTPUT_ADJUSTED = 3;
 890
 891 # To override computed values for writing the map tables for these properties.
 892 # The default for enum map tables is to write them out, so that the Unicode
 893 # .txt files can be removed, but all the data to compute any property value
 894 # for any code point is available in a more compact form.
 895 my %global_to_output_map = (
 896     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 897     # get stuck supporting it if things change.  Since it is a STRING
 898     # property, it normally would be listed in the pod, but INTERNAL_MAP
 899     # suppresses that.
 900     Unicode_1_Name => $INTERNAL_MAP,
 901
 902     Present_In => 0,                # Suppress, as easily computed from Age
 903     Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
 904                                             # retained, but needed for
 905                                             # non-ASCII
 906
 907     # Suppress, as mapping can be found instead from the
 908     # Perl_Decomposition_Mapping file
 909     Decomposition_Type => 0,
 910 );
 911
 912 # Properties that this program ignores.
 913 my @unimplemented_properties;
 914
 915 # With this release, it is automatically handled if the Unihan db is
 916 # downloaded
 917 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
 918
 919 # There are several types of obsolete properties defined by Unicode.  These
 920 # must be hand-edited for every new Unicode release.
 921 my %why_deprecated;  # Generates a deprecated warning message if used.
 922 my %why_stabilized;  # Documentation only
 923 my %why_obsolete;    # Documentation only
 924
 925 {   # Closure
 926     my $simple = 'Perl uses the more complete version';
 927     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 928
 929     my $other_properties = 'other properties';
 930     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 931     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 932
 933     %why_deprecated = (
 934         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 935         'Jamo_Short_Name' => $contributory,
 936         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 937         'Other_Alphabetic' => $contributory,
 938         'Other_Default_Ignorable_Code_Point' => $contributory,
 939         'Other_Grapheme_Extend' => $contributory,
 940         'Other_ID_Continue' => $contributory,
 941         'Other_ID_Start' => $contributory,
 942         'Other_Lowercase' => $contributory,
 943         'Other_Math' => $contributory,
 944         'Other_Uppercase' => $contributory,
 945         'Expands_On_NFC' => $why_no_expand,
 946         'Expands_On_NFD' => $why_no_expand,
 947         'Expands_On_NFKC' => $why_no_expand,
 948         'Expands_On_NFKD' => $why_no_expand,
 949     );
 950
 951     %why_suppressed = (
 952         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 953         # contains the same information, but without the algorithmically
 954         # determinable Hangul syllables'.  This file is not published, so it's
 955         # existence is not noted in the comment.
 956         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
 957
 958         'Indic_Matra_Category' => "Provisional",
 959         'Indic_Syllabic_Category' => "Provisional",
 960
 961         # Don't suppress ISO_Comment, as otherwise special handling is needed
 962         # to differentiate between it and gc=c, which can be written as 'isc',
 963         # which is the same characters as ISO_Comment's short name.
 964
 965         'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
 966
 967         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
 968         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 969         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 970         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 971
 972         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 973     );
 974
 975     foreach my $property (
 976
 977             # The following are suppressed because they were made contributory
 978             # or deprecated by Unicode before Perl ever thought about
 979             # supporting them.
 980             'Jamo_Short_Name',
 981             'Grapheme_Link',
 982             'Expands_On_NFC',
 983             'Expands_On_NFD',
 984             'Expands_On_NFKC',
 985             'Expands_On_NFKD',
 986
 987             # The following are suppressed because they have been marked
 988             # as deprecated for a sufficient amount of time
 989             'Other_Alphabetic',
 990             'Other_Default_Ignorable_Code_Point',
 991             'Other_Grapheme_Extend',
 992             'Other_ID_Continue',
 993             'Other_ID_Start',
 994             'Other_Lowercase',
 995             'Other_Math',
 996             'Other_Uppercase',
 997     ) {
 998         $why_suppressed{$property} = $why_deprecated{$property};
 999     }
1000
1001     # Customize the message for all the 'Other_' properties
1002     foreach my $property (keys %why_deprecated) {
1003         next if (my $main_property = $property) !~ s/^Other_//;
1004         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1005     }
1006 }
1007
1008 if ($write_Unicode_deprecated_tables) {
1009     foreach my $property (keys %why_suppressed) {
1010         delete $why_suppressed{$property} if $property =~
1011                                                     / ^ Other | Grapheme /x;
1012     }
1013 }
1014
1015 if ($v_version ge 4.0.0) {
1016     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1017     if ($v_version ge 6.0.0) {
1018         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1019     }
1020 }
1021 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1022     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1023     if ($v_version ge 6.0.0) {
1024         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1025     }
1026 }
1027
1028 # Probably obsolete forever
1029 if ($v_version ge v4.1.0) {
1030     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1031 }
1032 if ($v_version ge v6.0.0) {
1033     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1034     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1035 }
1036
1037 # This program can create files for enumerated-like properties, such as
1038 # 'Numeric_Type'.  This file would be the same format as for a string
1039 # property, with a mapping from code point to its value, so you could look up,
1040 # for example, the script a code point is in.  But no one so far wants this
1041 # mapping, or they have found another way to get it since this is a new
1042 # feature.  So no file is generated except if it is in this list.
1043 my @output_mapped_properties = split "\n", <<END;
1044 END
1045
1046 # If you are using the Unihan database in a Unicode version before 5.2, you
1047 # need to add the properties that you want to extract from it to this table.
1048 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1049 # listed, commented out
1050 my @cjk_properties = split "\n", <<'END';
1051 #cjkAccountingNumeric; kAccountingNumeric
1052 #cjkOtherNumeric; kOtherNumeric
1053 #cjkPrimaryNumeric; kPrimaryNumeric
1054 #cjkCompatibilityVariant; kCompatibilityVariant
1055 #cjkIICore ; kIICore
1056 #cjkIRG_GSource; kIRG_GSource
1057 #cjkIRG_HSource; kIRG_HSource
1058 #cjkIRG_JSource; kIRG_JSource
1059 #cjkIRG_KPSource; kIRG_KPSource
1060 #cjkIRG_KSource; kIRG_KSource
1061 #cjkIRG_TSource; kIRG_TSource
1062 #cjkIRG_USource; kIRG_USource
1063 #cjkIRG_VSource; kIRG_VSource
1064 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
1065 END
1066
1067 # Similarly for the property values.  For your convenience, the lines in the
1068 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
1069 # '#' marks (for Unicode versions before 5.2)
1070 my @cjk_property_values = split "\n", <<'END';
1071 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1072 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1073 ## @missing: 0000..10FFFF; cjkIICore; <none>
1074 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1075 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1076 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1077 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1078 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1079 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1080 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1081 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1082 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1083 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1084 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1085 END
1086
1087 # The input files don't list every code point.  Those not listed are to be
1088 # defaulted to some value.  Below are hard-coded what those values are for
1089 # non-binary properties as of 5.1.  Starting in 5.0, there are
1090 # machine-parsable comment lines in the files that give the defaults; so this
1091 # list shouldn't have to be extended.  The claim is that all missing entries
1092 # for binary properties will default to 'N'.  Unicode tried to change that in
1093 # 5.2, but the beta period produced enough protest that they backed off.
1094 #
1095 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1096 # be in the form that it expects.  The others may be synonyms.
1097 my $CODE_POINT = '<code point>';
1098 my %default_mapping = (
1099     Age => "Unassigned",
1100     # Bidi_Class => Complicated; set in code
1101     Bidi_Mirroring_Glyph => "",
1102     Block => 'No_Block',
1103     Canonical_Combining_Class => 0,
1104     Case_Folding => $CODE_POINT,
1105     Decomposition_Mapping => $CODE_POINT,
1106     Decomposition_Type => 'None',
1107     East_Asian_Width => "Neutral",
1108     FC_NFKC_Closure => $CODE_POINT,
1109     General_Category => 'Cn',
1110     Grapheme_Cluster_Break => 'Other',
1111     Hangul_Syllable_Type => 'NA',
1112     ISO_Comment => "",
1113     Jamo_Short_Name => "",
1114     Joining_Group => "No_Joining_Group",
1115     # Joining_Type => Complicated; set in code
1116     kIICore => 'N',   #                       Is converted to binary
1117     #Line_Break => Complicated; set in code
1118     Lowercase_Mapping => $CODE_POINT,
1119     Name => "",
1120     Name_Alias => "",
1121     NFC_QC => 'Yes',
1122     NFD_QC => 'Yes',
1123     NFKC_QC => 'Yes',
1124     NFKD_QC => 'Yes',
1125     Numeric_Type => 'None',
1126     Numeric_Value => 'NaN',
1127     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1128     Sentence_Break => 'Other',
1129     Simple_Case_Folding => $CODE_POINT,
1130     Simple_Lowercase_Mapping => $CODE_POINT,
1131     Simple_Titlecase_Mapping => $CODE_POINT,
1132     Simple_Uppercase_Mapping => $CODE_POINT,
1133     Titlecase_Mapping => $CODE_POINT,
1134     Unicode_1_Name => "",
1135     Unicode_Radical_Stroke => "",
1136     Uppercase_Mapping => $CODE_POINT,
1137     Word_Break => 'Other',
1138 );
1139
1140 # Below are files that Unicode furnishes, but this program ignores, and why.
1141 # NormalizationCorrections.txt requires some more explanation.  It documents
1142 # the cumulative fixes to erroneous normalizations in earlier Unicode
1143 # versions.  Its main purpose is so that someone running on an earlier version
1144 # can use this file to override what got published in that earlier release.
1145 # It would be easy for mktables to read and handle this file.  But all the
1146 # corrections in it should already be in the other files for the release it
1147 # is.  To get it to actually mean something useful, someone would have to be
1148 # using an earlier Unicode release, and copy it to the files for that release
1149 # and recomplile.  So far there has been no demand to do that, so this hasn't
1150 # been implemented.
1151 my %ignored_files = (
1152     'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1153     'Index.txt' => 'Alphabetical index of Unicode characters',
1154     'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1155     'NamesList.txt' => 'Annotated list of characters',
1156     'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>',
1157     'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1158     'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1159     'ReadMe.txt' => 'Documentation',
1160     'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized.  This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1161     'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.',
1162     'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1163     'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1164     'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>',
1165     'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1166     'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1167     'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1168     'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1169 );
1170
1171 my %skipped_files;  # List of files that we skip
1172
1173 ### End of externally interesting definitions, except for @input_file_objects
1174
1175 my $HEADER=<<"EOF";
1176 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1177 # This file is machine-generated by $0 from the Unicode
1178 # database, Version $string_version.  Any changes made here will be lost!
1179 EOF
1180
1181 my $INTERNAL_ONLY_HEADER = <<"EOF";
1182
1183 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1184 # This file is for internal use by core Perl only.  The format and even the
1185 # name or existence of this file are subject to change without notice.  Don't
1186 # use it directly.  Use Unicode::UCD to access the Unicode character data
1187 # base.
1188 EOF
1189
1190 my $DEVELOPMENT_ONLY=<<"EOF";
1191 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1192 # This file contains information artificially constrained to code points
1193 # present in Unicode release $string_compare_versions.
1194 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1195 # not be used for production.
1196
1197 EOF
1198
1199 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1200 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1201 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1202
1203 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1204 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1205 # to decrease backtracking.  The first regex allows the code point to be at
1206 # the end of a word, but to work properly, the word shouldn't end with a valid
1207 # hex character.  The second one won't match a code point at the end of a
1208 # word, and doesn't have the run-on issue
1209 my $run_on_code_point_re =
1210             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1211 my $code_point_re = qr/\b$run_on_code_point_re/;
1212
1213 # This matches the beginning of the line in the Unicode db files that give the
1214 # defaults for code points not listed (i.e., missing) in the file.  The code
1215 # depends on this ending with a semi-colon, so it can assume it is a valid
1216 # field when the line is split() by semi-colons
1217 my $missing_defaults_prefix =
1218             qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1219
1220 # Property types.  Unicode has more types, but these are sufficient for our
1221 # purposes.
1222 my $UNKNOWN = -1;   # initialized to illegal value
1223 my $NON_STRING = 1; # Either binary or enum
1224 my $BINARY = 2;
1225 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1226                        # tables, additional true and false tables are
1227                        # generated so that false is anything matching the
1228                        # default value, and true is everything else.
1229 my $ENUM = 4;       # Include catalog
1230 my $STRING = 5;     # Anything else: string or misc
1231
1232 # Some input files have lines that give default values for code points not
1233 # contained in the file.  Sometimes these should be ignored.
1234 my $NO_DEFAULTS = 0;        # Must evaluate to false
1235 my $NOT_IGNORED = 1;
1236 my $IGNORED = 2;
1237
1238 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1239 # and will appear in the main body of the tables in the output files, but
1240 # there are other types of ranges as well, listed below, that are specially
1241 # handled.   There are pseudo-types as well that will never be stored as a
1242 # type, but will affect the calculation of the type.
1243
1244 # 0 is for normal, non-specials
1245 my $MULTI_CP = 1;           # Sequence of more than code point
1246 my $HANGUL_SYLLABLE = 2;
1247 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1248 my $NULL = 4;               # The map is to the null string; utf8.c can't
1249                             # handle these, nor is there an accepted syntax
1250                             # for them in \p{} constructs
1251 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1252                              # otherwise be $MULTI_CP type are instead type 0
1253
1254 # process_generic_property_file() can accept certain overrides in its input.
1255 # Each of these must begin AND end with $CMD_DELIM.
1256 my $CMD_DELIM = "\a";
1257 my $REPLACE_CMD = 'replace';    # Override the Replace
1258 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1259
1260 my $NO = 0;
1261 my $YES = 1;
1262
1263 # Values for the Replace argument to add_range.
1264 # $NO                      # Don't replace; add only the code points not
1265                            # already present.
1266 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1267                            # the comments at the subroutine definition.
1268 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1269 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1270                            # already there
1271 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1272                            # already there
1273 my $CROAK = 6;             # Die with an error if is already there
1274
1275 # Flags to give property statuses.  The phrases are to remind maintainers that
1276 # if the flag is changed, the indefinite article referring to it in the
1277 # documentation may need to be as well.
1278 my $NORMAL = "";
1279 my $DEPRECATED = 'D';
1280 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1281 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1282 my $DISCOURAGED = 'X';
1283 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1284 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1285 my $STRICTER = 'T';
1286 my $a_bold_stricter = "a 'B<$STRICTER>'";
1287 my $A_bold_stricter = "A 'B<$STRICTER>'";
1288 my $STABILIZED = 'S';
1289 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1290 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1291 my $OBSOLETE = 'O';
1292 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1293 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1294
1295 my %status_past_participles = (
1296     $DISCOURAGED => 'discouraged',
1297     $STABILIZED => 'stabilized',
1298     $OBSOLETE => 'obsolete',
1299     $DEPRECATED => 'deprecated',
1300 );
1301
1302 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1303 # externally documented.
1304 my $ORDINARY = 0;       # The normal fate.
1305 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1306                         # but there is a file written that can be used to
1307                         # reconstruct this table
1308 my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1309                         # for Perl's internal use only
1310 my $LEGACY_ONLY = 3;    # Like $INTERNAL_ONLY, but not actually used by Perl.
1311                         # Is for backwards compatibility for applications that
1312                         # read the file directly, so it's format is
1313                         # unchangeable.
1314 my $SUPPRESSED = 4;     # The file for this table is not written out, and as a
1315                         # result, we don't bother to do many computations on
1316                         # it.
1317 my $PLACEHOLDER = 5;    # Like $SUPPRESSED, but we go through all the
1318                         # computations anyway, as the values are needed for
1319                         # things to work.  This happens when we have Perl
1320                         # extensions that depend on Unicode tables that
1321                         # wouldn't normally be in a given Unicode version.
1322
1323 # The format of the values of the tables:
1324 my $EMPTY_FORMAT = "";
1325 my $BINARY_FORMAT = 'b';
1326 my $DECIMAL_FORMAT = 'd';
1327 my $FLOAT_FORMAT = 'f';
1328 my $INTEGER_FORMAT = 'i';
1329 my $HEX_FORMAT = 'x';
1330 my $RATIONAL_FORMAT = 'r';
1331 my $STRING_FORMAT = 's';
1332 my $ADJUST_FORMAT = 'a';
1333 my $HEX_ADJUST_FORMAT = 'ax';
1334 my $DECOMP_STRING_FORMAT = 'c';
1335 my $STRING_WHITE_SPACE_LIST = 'sw';
1336
1337 my %map_table_formats = (
1338     $BINARY_FORMAT => 'binary',
1339     $DECIMAL_FORMAT => 'single decimal digit',
1340     $FLOAT_FORMAT => 'floating point number',
1341     $INTEGER_FORMAT => 'integer',
1342     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1343     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1344     $STRING_FORMAT => 'string',
1345     $ADJUST_FORMAT => 'some entries need adjustment',
1346     $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1347     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1348     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1349 );
1350
1351 # Unicode didn't put such derived files in a separate directory at first.
1352 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1353 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1354 my $AUXILIARY = 'auxiliary';
1355
1356 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1357 # and into UCD.pl for the use of UCD.pm
1358 my %loose_to_file_of;       # loosely maps table names to their respective
1359                             # files
1360 my %stricter_to_file_of;    # same; but for stricter mapping.
1361 my %loose_property_to_file_of; # Maps a loose property name to its map file
1362 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1363                             # in the hash %utf8::SwashInfo
1364 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1365                              # their rational equivalent
1366 my %loose_property_name_of; # Loosely maps (non_string) property names to
1367                             # standard form
1368 my %string_property_loose_to_name; # Same, for string properties.
1369 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1370                             # the property name in standard loose form, and
1371                             # 'value' is the default value for that property,
1372                             # also in standard loose form.
1373 my %loose_to_standard_value; # loosely maps table names to the canonical
1374                             # alias for them
1375 my %ambiguous_names;        # keys are alias names (in standard form) that
1376                             # have more than one possible meaning.
1377 my %prop_aliases;           # Keys are standard property name; values are each
1378                             # one's aliases
1379 my %prop_value_aliases;     # Keys of top level are standard property name;
1380                             # values are keys to another hash,  Each one is
1381                             # one of the property's values, in standard form.
1382                             # The values are that prop-val's aliases.
1383 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1384
1385 # Most properties are immune to caseless matching, otherwise you would get
1386 # nonsensical results, as properties are a function of a code point, not
1387 # everything that is caselessly equivalent to that code point.  For example,
1388 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1389 # be true because 's' and 'S' are equivalent caselessly.  However,
1390 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1391 # extend that concept to those very few properties that are like this.  Each
1392 # such property will match the full range caselessly.  They are hard-coded in
1393 # the program; it's not worth trying to make it general as it's extremely
1394 # unlikely that they will ever change.
1395 my %caseless_equivalent_to;
1396
1397 # These constants names and values were taken from the Unicode standard,
1398 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1399 # syllables.  The '_string' versions are so generated tables can retain the
1400 # hex format, which is the more familiar value
1401 my $SBase_string = "0xAC00";
1402 my $SBase = CORE::hex $SBase_string;
1403 my $LBase_string = "0x1100";
1404 my $LBase = CORE::hex $LBase_string;
1405 my $VBase_string = "0x1161";
1406 my $VBase = CORE::hex $VBase_string;
1407 my $TBase_string = "0x11A7";
1408 my $TBase = CORE::hex $TBase_string;
1409 my $SCount = 11172;
1410 my $LCount = 19;
1411 my $VCount = 21;
1412 my $TCount = 28;
1413 my $NCount = $VCount * $TCount;
1414
1415 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1416 # with the above published constants.
1417 my %Jamo;
1418 my %Jamo_L;     # Leading consonants
1419 my %Jamo_V;     # Vowels
1420 my %Jamo_T;     # Trailing consonants
1421
1422 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1423 # The key is the base name of the code point, and the value is an
1424 # array giving all the ranges that use this base name.  Each range
1425 # is actually a hash giving the 'low' and 'high' values of it.
1426 my %names_ending_in_code_point;
1427 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1428                                         # removed from the names
1429 # Inverse mapping.  The list of ranges that have these kinds of
1430 # names.  Each element contains the low, high, and base names in an
1431 # anonymous hash.
1432 my @code_points_ending_in_code_point;
1433
1434 # To hold Unicode's normalization test suite
1435 my @normalization_tests;
1436
1437 # Boolean: does this Unicode version have the hangul syllables, and are we
1438 # writing out a table for them?
1439 my $has_hangul_syllables = 0;
1440
1441 # Does this Unicode version have code points whose names end in their
1442 # respective code points, and are we writing out a table for them?  0 for no;
1443 # otherwise points to first property that a table is needed for them, so that
1444 # if multiple tables are needed, we don't create duplicates
1445 my $needing_code_points_ending_in_code_point = 0;
1446
1447 my @backslash_X_tests;     # List of tests read in for testing \X
1448 my @unhandled_properties;  # Will contain a list of properties found in
1449                            # the input that we didn't process.
1450 my @match_properties;      # Properties that have match tables, to be
1451                            # listed in the pod
1452 my @map_properties;        # Properties that get map files written
1453 my @named_sequences;       # NamedSequences.txt contents.
1454 my %potential_files;       # Generated list of all .txt files in the directory
1455                            # structure so we can warn if something is being
1456                            # ignored.
1457 my @files_actually_output; # List of files we generated.
1458 my @more_Names;            # Some code point names are compound; this is used
1459                            # to store the extra components of them.
1460 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1461                            # the minimum before we consider it equivalent to a
1462                            # candidate rational
1463 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1464
1465 # These store references to certain commonly used property objects
1466 my $gc;
1467 my $perl;
1468 my $block;
1469 my $perl_charname;
1470 my $print;
1471 my $Any;
1472 my $script;
1473
1474 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1475 my $has_In_conflicts = 0;
1476 my $has_Is_conflicts = 0;
1477
1478 sub internal_file_to_platform ($) {
1479     # Convert our file paths which have '/' separators to those of the
1480     # platform.
1481
1482     my $file = shift;
1483     return undef unless defined $file;
1484
1485     return File::Spec->join(split '/', $file);
1486 }
1487
1488 sub file_exists ($) {   # platform independent '-e'.  This program internally
1489                         # uses slash as a path separator.
1490     my $file = shift;
1491     return 0 if ! defined $file;
1492     return -e internal_file_to_platform($file);
1493 }
1494
1495 sub objaddr($) {
1496     # Returns the address of the blessed input object.
1497     # It doesn't check for blessedness because that would do a string eval
1498     # every call, and the program is structured so that this is never called
1499     # for a non-blessed object.
1500
1501     no overloading; # If overloaded, numifying below won't work.
1502
1503     # Numifying a ref gives its address.
1504     return pack 'J', $_[0];
1505 }
1506
1507 # These are used only if $annotate is true.
1508 # The entire range of Unicode characters is examined to populate these
1509 # after all the input has been processed.  But most can be skipped, as they
1510 # have the same descriptive phrases, such as being unassigned
1511 my @viacode;            # Contains the 1 million character names
1512 my @printable;          # boolean: And are those characters printable?
1513 my @annotate_char_type; # Contains a type of those characters, specifically
1514                         # for the purposes of annotation.
1515 my $annotate_ranges;    # A map of ranges of code points that have the same
1516                         # name for the purposes of annotation.  They map to the
1517                         # upper edge of the range, so that the end point can
1518                         # be immediately found.  This is used to skip ahead to
1519                         # the end of a range, and avoid processing each
1520                         # individual code point in it.
1521 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1522                                    # characters, but excluding those which are
1523                                    # also noncharacter code points
1524
1525 # The annotation types are an extension of the regular range types, though
1526 # some of the latter are folded into one.  Make the new types negative to
1527 # avoid conflicting with the regular types
1528 my $SURROGATE_TYPE = -1;
1529 my $UNASSIGNED_TYPE = -2;
1530 my $PRIVATE_USE_TYPE = -3;
1531 my $NONCHARACTER_TYPE = -4;
1532 my $CONTROL_TYPE = -5;
1533 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1534
1535 sub populate_char_info ($) {
1536     # Used only with the $annotate option.  Populates the arrays with the
1537     # input code point's info that are needed for outputting more detailed
1538     # comments.  If calling context wants a return, it is the end point of
1539     # any contiguous range of characters that share essentially the same info
1540
1541     my $i = shift;
1542     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1543
1544     $viacode[$i] = $perl_charname->value_of($i) || "";
1545
1546     # A character is generally printable if Unicode says it is,
1547     # but below we make sure that most Unicode general category 'C' types
1548     # aren't.
1549     $printable[$i] = $print->contains($i);
1550
1551     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1552
1553     # Only these two regular types are treated specially for annotations
1554     # purposes
1555     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1556                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1557
1558     # Give a generic name to all code points that don't have a real name.
1559     # We output ranges, if applicable, for these.  Also calculate the end
1560     # point of the range.
1561     my $end;
1562     if (! $viacode[$i]) {
1563         my $nonchar;
1564         if ($gc-> table('Private_use')->contains($i)) {
1565             $viacode[$i] = 'Private Use';
1566             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1567             $printable[$i] = 0;
1568             $end = $gc->table('Private_Use')->containing_range($i)->end;
1569         }
1570         elsif ((defined ($nonchar =
1571                             Property::property_ref('Noncharacter_Code_Point'))
1572                && $nonchar->table('Y')->contains($i)))
1573         {
1574             $viacode[$i] = 'Noncharacter';
1575             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1576             $printable[$i] = 0;
1577             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1578                                                     containing_range($i)->end;
1579         }
1580         elsif ($gc-> table('Control')->contains($i)) {
1581             $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1582             $annotate_char_type[$i] = $CONTROL_TYPE;
1583             $printable[$i] = 0;
1584         }
1585         elsif ($gc-> table('Unassigned')->contains($i)) {
1586             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1587             $printable[$i] = 0;
1588             if ($v_version lt v2.0.0) { # No blocks in earliest releases
1589                 $viacode[$i] = 'Unassigned';
1590                 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1591             }
1592             else {
1593                 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1594
1595                 # Because we name the unassigned by the blocks they are in, it
1596                 # can't go past the end of that block, and it also can't go
1597                 # past the unassigned range it is in.  The special table makes
1598                 # sure that the non-characters, which are unassigned, are
1599                 # separated out.
1600                 $end = min($block->containing_range($i)->end,
1601                            $unassigned_sans_noncharacters->
1602                                                     containing_range($i)->end);
1603             }
1604         }
1605         elsif ($v_version lt v2.0.0) {  # No surrogates in earliest releases
1606             $viacode[$i] = $gc->value_of($i);
1607             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1608             $printable[$i] = 0;
1609         }
1610         elsif ($gc-> table('Surrogate')->contains($i)) {
1611             $viacode[$i] = 'Surrogate';
1612             $annotate_char_type[$i] = $SURROGATE_TYPE;
1613             $printable[$i] = 0;
1614             $end = $gc->table('Surrogate')->containing_range($i)->end;
1615         }
1616         else {
1617             Carp::my_carp_bug("Can't figure out how to annotate "
1618                               . sprintf("U+%04X", $i)
1619                               . ".  Proceeding anyway.");
1620             $viacode[$i] = 'UNKNOWN';
1621             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1622             $printable[$i] = 0;
1623         }
1624     }
1625
1626     # Here, has a name, but if it's one in which the code point number is
1627     # appended to the name, do that.
1628     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1629         $viacode[$i] .= sprintf("-%04X", $i);
1630         $end = $perl_charname->containing_range($i)->end;
1631     }
1632
1633     # And here, has a name, but if it's a hangul syllable one, replace it with
1634     # the correct name from the Unicode algorithm
1635     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1636         use integer;
1637         my $SIndex = $i - $SBase;
1638         my $L = $LBase + $SIndex / $NCount;
1639         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1640         my $T = $TBase + $SIndex % $TCount;
1641         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1642         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1643         $end = $perl_charname->containing_range($i)->end;
1644     }
1645
1646     return if ! defined wantarray;
1647     return $i if ! defined $end;    # If not a range, return the input
1648
1649     # Save this whole range so can find the end point quickly
1650     $annotate_ranges->add_map($i, $end, $end);
1651
1652     return $end;
1653 }
1654
1655 # Commented code below should work on Perl 5.8.
1656 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1657 ## the native perl version of it (which is what would operate under miniperl)
1658 ## is extremely slow, as it does a string eval every call.
1659 #my $has_fast_scalar_util = $^X !~ /miniperl/
1660 #                            && defined eval "require Scalar::Util";
1661 #
1662 #sub objaddr($) {
1663 #    # Returns the address of the blessed input object.  Uses the XS version if
1664 #    # available.  It doesn't check for blessedness because that would do a
1665 #    # string eval every call, and the program is structured so that this is
1666 #    # never called for a non-blessed object.
1667 #
1668 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1669 #
1670 #    # Check at least that is a ref.
1671 #    my $pkg = ref($_[0]) or return undef;
1672 #
1673 #    # Change to a fake package to defeat any overloaded stringify
1674 #    bless $_[0], 'main::Fake';
1675 #
1676 #    # Numifying a ref gives its address.
1677 #    my $addr = pack 'J', $_[0];
1678 #
1679 #    # Return to original class
1680 #    bless $_[0], $pkg;
1681 #    return $addr;
1682 #}
1683
1684 sub max ($$) {
1685     my $a = shift;
1686     my $b = shift;
1687     return $a if $a >= $b;
1688     return $b;
1689 }
1690
1691 sub min ($$) {
1692     my $a = shift;
1693     my $b = shift;
1694     return $a if $a <= $b;
1695     return $b;
1696 }
1697
1698 sub clarify_number ($) {
1699     # This returns the input number with underscores inserted every 3 digits
1700     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1701     # checked.
1702
1703     my $number = shift;
1704     my $pos = length($number) - 3;
1705     return $number if $pos <= 1;
1706     while ($pos > 0) {
1707         substr($number, $pos, 0) = '_';
1708         $pos -= 3;
1709     }
1710     return $number;
1711 }
1712
1713
1714 package Carp;
1715
1716 # These routines give a uniform treatment of messages in this program.  They
1717 # are placed in the Carp package to cause the stack trace to not include them,
1718 # although an alternative would be to use another package and set @CARP_NOT
1719 # for it.
1720
1721 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1722
1723 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1724 # and overload trying to load Scalar:Util under miniperl.  See
1725 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1726 undef $overload::VERSION;
1727
1728 sub my_carp {
1729     my $message = shift || "";
1730     my $nofold = shift || 0;
1731
1732     if ($message) {
1733         $message = main::join_lines($message);
1734         $message =~ s/^$0: *//;     # Remove initial program name
1735         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1736         $message = "\n$0: $message;";
1737
1738         # Fold the message with program name, semi-colon end punctuation
1739         # (which looks good with the message that carp appends to it), and a
1740         # hanging indent for continuation lines.
1741         $message = main::simple_fold($message, "", 4) unless $nofold;
1742         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1743                                     # appends is to the same line
1744     }
1745
1746     return $message if defined wantarray;   # If a caller just wants the msg
1747
1748     carp $message;
1749     return;
1750 }
1751
1752 sub my_carp_bug {
1753     # This is called when it is clear that the problem is caused by a bug in
1754     # this program.
1755
1756     my $message = shift;
1757     $message =~ s/^$0: *//;
1758     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1759     carp $message;
1760     return;
1761 }
1762
1763 sub carp_too_few_args {
1764     if (@_ != 2) {
1765         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1766         return;
1767     }
1768
1769     my $args_ref = shift;
1770     my $count = shift;
1771
1772     my_carp_bug("Need at least $count arguments to "
1773         . (caller 1)[3]
1774         . ".  Instead got: '"
1775         . join ', ', @$args_ref
1776         . "'.  No action taken.");
1777     return;
1778 }
1779
1780 sub carp_extra_args {
1781     my $args_ref = shift;
1782     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1783
1784     unless (ref $args_ref) {
1785         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1786         return;
1787     }
1788     my ($package, $file, $line) = caller;
1789     my $subroutine = (caller 1)[3];
1790
1791     my $list;
1792     if (ref $args_ref eq 'HASH') {
1793         foreach my $key (keys %$args_ref) {
1794             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1795         }
1796         $list = join ', ', each %{$args_ref};
1797     }
1798     elsif (ref $args_ref eq 'ARRAY') {
1799         foreach my $arg (@$args_ref) {
1800             $arg = $UNDEF unless defined $arg;
1801         }
1802         $list = join ', ', @$args_ref;
1803     }
1804     else {
1805         my_carp_bug("Can't cope with ref "
1806                 . ref($args_ref)
1807                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1808         return;
1809     }
1810
1811     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1812     return;
1813 }
1814
1815 package main;
1816
1817 { # Closure
1818
1819     # This program uses the inside-out method for objects, as recommended in
1820     # "Perl Best Practices".  This closure aids in generating those.  There
1821     # are two routines.  setup_package() is called once per package to set
1822     # things up, and then set_access() is called for each hash representing a
1823     # field in the object.  These routines arrange for the object to be
1824     # properly destroyed when no longer used, and for standard accessor
1825     # functions to be generated.  If you need more complex accessors, just
1826     # write your own and leave those accesses out of the call to set_access().
1827     # More details below.
1828
1829     my %constructor_fields; # fields that are to be used in constructors; see
1830                             # below
1831
1832     # The values of this hash will be the package names as keys to other
1833     # hashes containing the name of each field in the package as keys, and
1834     # references to their respective hashes as values.
1835     my %package_fields;
1836
1837     sub setup_package {
1838         # Sets up the package, creating standard DESTROY and dump methods
1839         # (unless already defined).  The dump method is used in debugging by
1840         # simple_dumper().
1841         # The optional parameters are:
1842         #   a)  a reference to a hash, that gets populated by later
1843         #       set_access() calls with one of the accesses being
1844         #       'constructor'.  The caller can then refer to this, but it is
1845         #       not otherwise used by these two routines.
1846         #   b)  a reference to a callback routine to call during destruction
1847         #       of the object, before any fields are actually destroyed
1848
1849         my %args = @_;
1850         my $constructor_ref = delete $args{'Constructor_Fields'};
1851         my $destroy_callback = delete $args{'Destroy_Callback'};
1852         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1853
1854         my %fields;
1855         my $package = (caller)[0];
1856
1857         $package_fields{$package} = \%fields;
1858         $constructor_fields{$package} = $constructor_ref;
1859
1860         unless ($package->can('DESTROY')) {
1861             my $destroy_name = "${package}::DESTROY";
1862             no strict "refs";
1863
1864             # Use typeglob to give the anonymous subroutine the name we want
1865             *$destroy_name = sub {
1866                 my $self = shift;
1867                 my $addr = do { no overloading; pack 'J', $self; };
1868
1869                 $self->$destroy_callback if $destroy_callback;
1870                 foreach my $field (keys %{$package_fields{$package}}) {
1871                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1872                     delete $package_fields{$package}{$field}{$addr};
1873                 }
1874                 return;
1875             }
1876         }
1877
1878         unless ($package->can('dump')) {
1879             my $dump_name = "${package}::dump";
1880             no strict "refs";
1881             *$dump_name = sub {
1882                 my $self = shift;
1883                 return dump_inside_out($self, $package_fields{$package}, @_);
1884             }
1885         }
1886         return;
1887     }
1888
1889     sub set_access {
1890         # Arrange for the input field to be garbage collected when no longer
1891         # needed.  Also, creates standard accessor functions for the field
1892         # based on the optional parameters-- none if none of these parameters:
1893         #   'addable'    creates an 'add_NAME()' accessor function.
1894         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1895         #                function.
1896         #   'settable'   creates a 'set_NAME()' accessor function.
1897         #   'constructor' doesn't create an accessor function, but adds the
1898         #                field to the hash that was previously passed to
1899         #                setup_package();
1900         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1901         # 'add' etc. all mean 'addable'.
1902         # The read accessor function will work on both array and scalar
1903         # values.  If another accessor in the parameter list is 'a', the read
1904         # access assumes an array.  You can also force it to be array access
1905         # by specifying 'readable_array' instead of 'readable'
1906         #
1907         # A sort-of 'protected' access can be set-up by preceding the addable,
1908         # readable or settable with some initial portion of 'protected_' (but,
1909         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1910         # "protection" is only by convention.  All that happens is that the
1911         # accessor functions' names begin with an underscore.  So instead of
1912         # calling set_foo, the call is _set_foo.  (Real protection could be
1913         # accomplished by having a new subroutine, end_package, called at the
1914         # end of each package, and then storing the __LINE__ ranges and
1915         # checking them on every accessor.  But that is way overkill.)
1916
1917         # We create anonymous subroutines as the accessors and then use
1918         # typeglobs to assign them to the proper package and name
1919
1920         my $name = shift;   # Name of the field
1921         my $field = shift;  # Reference to the inside-out hash containing the
1922                             # field
1923
1924         my $package = (caller)[0];
1925
1926         if (! exists $package_fields{$package}) {
1927             croak "$0: Must call 'setup_package' before 'set_access'";
1928         }
1929
1930         # Stash the field so DESTROY can get it.
1931         $package_fields{$package}{$name} = $field;
1932
1933         # Remaining arguments are the accessors.  For each...
1934         foreach my $access (@_) {
1935             my $access = lc $access;
1936
1937             my $protected = "";
1938
1939             # Match the input as far as it goes.
1940             if ($access =~ /^(p[^_]*)_/) {
1941                 $protected = $1;
1942                 if (substr('protected_', 0, length $protected)
1943                     eq $protected)
1944                 {
1945
1946                     # Add 1 for the underscore not included in $protected
1947                     $access = substr($access, length($protected) + 1);
1948                     $protected = '_';
1949                 }
1950                 else {
1951                     $protected = "";
1952                 }
1953             }
1954
1955             if (substr('addable', 0, length $access) eq $access) {
1956                 my $subname = "${package}::${protected}add_$name";
1957                 no strict "refs";
1958
1959                 # add_ accessor.  Don't add if already there, which we
1960                 # determine using 'eq' for scalars and '==' otherwise.
1961                 *$subname = sub {
1962                     use strict "refs";
1963                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1964                     my $self = shift;
1965                     my $value = shift;
1966                     my $addr = do { no overloading; pack 'J', $self; };
1967                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1968                     if (ref $value) {
1969                         return if grep { $value == $_ } @{$field->{$addr}};
1970                     }
1971                     else {
1972                         return if grep { $value eq $_ } @{$field->{$addr}};
1973                     }
1974                     push @{$field->{$addr}}, $value;
1975                     return;
1976                 }
1977             }
1978             elsif (substr('constructor', 0, length $access) eq $access) {
1979                 if ($protected) {
1980                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1981                 }
1982                 else {
1983                     $constructor_fields{$package}{$name} = $field;
1984                 }
1985             }
1986             elsif (substr('readable_array', 0, length $access) eq $access) {
1987
1988                 # Here has read access.  If one of the other parameters for
1989                 # access is array, or this one specifies array (by being more
1990                 # than just 'readable_'), then create a subroutine that
1991                 # assumes the data is an array.  Otherwise just a scalar
1992                 my $subname = "${package}::${protected}$name";
1993                 if (grep { /^a/i } @_
1994                     or length($access) > length('readable_'))
1995                 {
1996                     no strict "refs";
1997                     *$subname = sub {
1998                         use strict "refs";
1999                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2000                         my $addr = do { no overloading; pack 'J', $_[0]; };
2001                         if (ref $field->{$addr} ne 'ARRAY') {
2002                             my $type = ref $field->{$addr};
2003                             $type = 'scalar' unless $type;
2004                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
2005                             return;
2006                         }
2007                         return scalar @{$field->{$addr}} unless wantarray;
2008
2009                         # Make a copy; had problems with caller modifying the
2010                         # original otherwise
2011                         my @return = @{$field->{$addr}};
2012                         return @return;
2013                     }
2014                 }
2015                 else {
2016
2017                     # Here not an array value, a simpler function.
2018                     no strict "refs";
2019                     *$subname = sub {
2020                         use strict "refs";
2021                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2022                         no overloading;
2023                         return $field->{pack 'J', $_[0]};
2024                     }
2025                 }
2026             }
2027             elsif (substr('settable', 0, length $access) eq $access) {
2028                 my $subname = "${package}::${protected}set_$name";
2029                 no strict "refs";
2030                 *$subname = sub {
2031                     use strict "refs";
2032                     if (main::DEBUG) {
2033                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2034                         Carp::carp_extra_args(\@_) if @_ > 2;
2035                     }
2036                     # $self is $_[0]; $value is $_[1]
2037                     no overloading;
2038                     $field->{pack 'J', $_[0]} = $_[1];
2039                     return;
2040                 }
2041             }
2042             else {
2043                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2044             }
2045         }
2046         return;
2047     }
2048 }
2049
2050 package Input_file;
2051
2052 # All input files use this object, which stores various attributes about them,
2053 # and provides for convenient, uniform handling.  The run method wraps the
2054 # processing.  It handles all the bookkeeping of opening, reading, and closing
2055 # the file, returning only significant input lines.
2056 #
2057 # Each object gets a handler which processes the body of the file, and is
2058 # called by run().  All character property files must use the generic,
2059 # default handler, which has code scrubbed to handle things you might not
2060 # expect, including automatic EBCDIC handling.  For files that don't deal with
2061 # mapping code points to a property value, such as test files,
2062 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2063 # handler to be a custom one.  Such a handler should basically be a
2064 # while(next_line()) {...} loop.
2065 #
2066 # You can also set up handlers to
2067 #   1) call before the first line is read, for pre processing
2068 #   2) call to adjust each line of the input before the main handler gets
2069 #      them.  This can be automatically generated, if appropriately simple
2070 #      enough, by specifiying a Properties parameter in the constructor.
2071 #   3) call upon EOF before the main handler exits its loop
2072 #   4) call at the end, for post processing
2073 #
2074 # $_ is used to store the input line, and is to be filtered by the
2075 # each_line_handler()s.  So, if the format of the line is not in the desired
2076 # format for the main handler, these are used to do that adjusting.  They can
2077 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2078 # so the $_ output of one is used as the input to the next.  None of the other
2079 # handlers are stackable, but could easily be changed to be so.
2080 #
2081 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2082 # which insert the parameters as lines to be processed before the next input
2083 # file line is read.  This allows the EOF handler to flush buffers, for
2084 # example.  The difference between the two routines is that the lines inserted
2085 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
2086 # called it from such a handler, you would get infinite recursion.)  Lines
2087 # inserted by insert_adjusted_lines() go directly to the main handler without
2088 # any adjustments.  If the  post-processing handler calls any of these, there
2089 # will be no effect.  Some error checking for these conditions could be added,
2090 # but it hasn't been done.
2091 #
2092 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2093 # to prevent further processing of the line.  This routine will output the
2094 # message as a warning once, and then keep a count of the lines that have the
2095 # same message, and output that count at the end of the file's processing.
2096 # This keeps the number of messages down to a manageable amount.
2097 #
2098 # get_missings() should be called to retrieve any @missing input lines.
2099 # Messages will be raised if this isn't done if the options aren't to ignore
2100 # missings.
2101
2102 sub trace { return main::trace(@_); }
2103
2104 { # Closure
2105     # Keep track of fields that are to be put into the constructor.
2106     my %constructor_fields;
2107
2108     main::setup_package(Constructor_Fields => \%constructor_fields);
2109
2110     my %file; # Input file name, required
2111     main::set_access('file', \%file, qw{ c r });
2112
2113     my %first_released; # Unicode version file was first released in, required
2114     main::set_access('first_released', \%first_released, qw{ c r });
2115
2116     my %handler;    # Subroutine to process the input file, defaults to
2117                     # 'process_generic_property_file'
2118     main::set_access('handler', \%handler, qw{ c });
2119
2120     my %property;
2121     # name of property this file is for.  defaults to none, meaning not
2122     # applicable, or is otherwise determinable, for example, from each line.
2123     main::set_access('property', \%property, qw{ c r });
2124
2125     my %optional;
2126     # If this is true, the file is optional.  If not present, no warning is
2127     # output.  If it is present, the string given by this parameter is
2128     # evaluated, and if false the file is not processed.
2129     main::set_access('optional', \%optional, 'c', 'r');
2130
2131     my %non_skip;
2132     # This is used for debugging, to skip processing of all but a few input
2133     # files.  Add 'non_skip => 1' to the constructor for those files you want
2134     # processed when you set the $debug_skip global.
2135     main::set_access('non_skip', \%non_skip, 'c');
2136
2137     my %skip;
2138     # This is used to skip processing of this input file semi-permanently,
2139     # when it evaluates to true.  The value should be the reason the file is
2140     # being skipped.  It is used for files that we aren't planning to process
2141     # anytime soon, but want to allow to be in the directory and not raise a
2142     # message that we are not handling.  Mostly for test files.  This is in
2143     # contrast to the non_skip element, which is supposed to be used very
2144     # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
2145     # pretty much will never look at can be placed in the global
2146     # %ignored_files instead.  Ones used here will be added to %skipped files
2147     main::set_access('skip', \%skip, 'c');
2148
2149     my %each_line_handler;
2150     # list of subroutines to look at and filter each non-comment line in the
2151     # file.  defaults to none.  The subroutines are called in order, each is
2152     # to adjust $_ for the next one, and the final one adjusts it for
2153     # 'handler'
2154     main::set_access('each_line_handler', \%each_line_handler, 'c');
2155
2156     my %properties; # Optional ordered list of the properties that occur in each
2157     # meaningful line of the input file.  If present, an appropriate
2158     # each_line_handler() is automatically generated and pushed onto the stack
2159     # of such handlers.  This is useful when a file contains multiple
2160     # proerties per line, but no other special considerations are necessary.
2161     # The special value "<ignored>" means to discard the corresponding input
2162     # field.
2163     # Any @missing lines in the file should also match this syntax; no such
2164     # files exist as of 6.3.  But if it happens in a future release, the code
2165     # could be expanded to properly parse them.
2166     main::set_access('properties', \%properties, qw{ c r });
2167
2168     my %has_missings_defaults;
2169     # ? Are there lines in the file giving default values for code points
2170     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2171     # the norm, but IGNORED means it has such lines, but the handler doesn't
2172     # use them.  Having these three states allows us to catch changes to the
2173     # UCD that this program should track.  XXX This could be expanded to
2174     # specify the syntax for such lines, like %properties above.
2175     main::set_access('has_missings_defaults',
2176                                         \%has_missings_defaults, qw{ c r });
2177
2178     my %pre_handler;
2179     # Subroutine to call before doing anything else in the file.  If undef, no
2180     # such handler is called.
2181     main::set_access('pre_handler', \%pre_handler, qw{ c });
2182
2183     my %eof_handler;
2184     # Subroutine to call upon getting an EOF on the input file, but before
2185     # that is returned to the main handler.  This is to allow buffers to be
2186     # flushed.  The handler is expected to call insert_lines() or
2187     # insert_adjusted() with the buffered material
2188     main::set_access('eof_handler', \%eof_handler, qw{ c r });
2189
2190     my %post_handler;
2191     # Subroutine to call after all the lines of the file are read in and
2192     # processed.  If undef, no such handler is called.
2193     main::set_access('post_handler', \%post_handler, qw{ c });
2194
2195     my %progress_message;
2196     # Message to print to display progress in lieu of the standard one
2197     main::set_access('progress_message', \%progress_message, qw{ c });
2198
2199     my %handle;
2200     # cache open file handle, internal.  Is undef if file hasn't been
2201     # processed at all, empty if has;
2202     main::set_access('handle', \%handle);
2203
2204     my %added_lines;
2205     # cache of lines added virtually to the file, internal
2206     main::set_access('added_lines', \%added_lines);
2207
2208     my %remapped_lines;
2209     # cache of lines added virtually to the file, internal
2210     main::set_access('remapped_lines', \%remapped_lines);
2211
2212     my %errors;
2213     # cache of errors found, internal
2214     main::set_access('errors', \%errors);
2215
2216     my %missings;
2217     # storage of '@missing' defaults lines
2218     main::set_access('missings', \%missings);
2219
2220     sub _next_line;
2221     sub _next_line_with_remapped_range;
2222
2223     sub new {
2224         my $class = shift;
2225
2226         my $self = bless \do{ my $anonymous_scalar }, $class;
2227         my $addr = do { no overloading; pack 'J', $self; };
2228
2229         # Set defaults
2230         $handler{$addr} = \&main::process_generic_property_file;
2231         $non_skip{$addr} = 0;
2232         $skip{$addr} = 0;
2233         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2234         $handle{$addr} = undef;
2235         $added_lines{$addr} = [ ];
2236         $remapped_lines{$addr} = [ ];
2237         $each_line_handler{$addr} = [ ];
2238         $errors{$addr} = { };
2239         $missings{$addr} = [ ];
2240
2241         # Two positional parameters.
2242         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2243         $file{$addr} = main::internal_file_to_platform(shift);
2244         $first_released{$addr} = shift;
2245
2246         # The rest of the arguments are key => value pairs
2247         # %constructor_fields has been set up earlier to list all possible
2248         # ones.  Either set or push, depending on how the default has been set
2249         # up just above.
2250         my %args = @_;
2251         foreach my $key (keys %args) {
2252             my $argument = $args{$key};
2253
2254             # Note that the fields are the lower case of the constructor keys
2255             my $hash = $constructor_fields{lc $key};
2256             if (! defined $hash) {
2257                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2258                 next;
2259             }
2260             if (ref $hash->{$addr} eq 'ARRAY') {
2261                 if (ref $argument eq 'ARRAY') {
2262                     foreach my $argument (@{$argument}) {
2263                         next if ! defined $argument;
2264                         push @{$hash->{$addr}}, $argument;
2265                     }
2266                 }
2267                 else {
2268                     push @{$hash->{$addr}}, $argument if defined $argument;
2269                 }
2270             }
2271             else {
2272                 $hash->{$addr} = $argument;
2273             }
2274             delete $args{$key};
2275         };
2276
2277         # If the file has a property for it, it means that the property is not
2278         # listed in the file's entries.  So add a handler to the list of line
2279         # handlers to insert the property name into the lines, to provide a
2280         # uniform interface to the final processing subroutine.
2281         # the final code doesn't have to worry about that.
2282         if ($property{$addr}) {
2283             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2284         }
2285
2286         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2287             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2288         }
2289
2290         # If skipping, set to optional, and add to list of ignored files,
2291         # including its reason
2292         if ($skip{$addr}) {
2293             $optional{$addr} = 1;
2294             $skipped_files{$file{$addr}} = $skip{$addr}
2295         }
2296         elsif ($properties{$addr}) {
2297
2298             # Add a handler for each line in the input so that it creates a
2299             # separate input line for each property in those input lines, thus
2300             # making them suitable for process_generic_property_file().
2301
2302             push @{$each_line_handler{$addr}},
2303                  sub {
2304                     my $file = shift;
2305                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2306
2307                     my @fields = split /\s*;\s*/, $_, -1;
2308
2309                     if (@fields - 1 > @{$properties{$addr}}) {
2310                         $file->carp_bad_line('Extra fields');
2311                         $_ = "";
2312                         return;
2313                     }
2314                     my $range = shift @fields;  # 0th element is always the
2315                                                 # range
2316
2317                     # The next fields in the input line correspond
2318                     # respectively to the stored properties.
2319                     for my $i (0 ..  @{$properties{$addr}} - 1) {
2320                         my $property_name = $properties{$addr}[$i];
2321                         next if $property_name eq '<ignored>';
2322                         $file->insert_adjusted_lines(
2323                               "$range; $property_name; $fields[$i]");
2324                     }
2325                     $_ = "";
2326
2327                     return;
2328                 };
2329         }
2330
2331         {   # On non-ascii platforms, we use a special handler
2332             no strict;
2333             no warnings 'once';
2334             *next_line = (main::NON_ASCII_PLATFORM)
2335                          ? *_next_line_with_remapped_range
2336                          : *_next_line;
2337         }
2338
2339         return $self;
2340     }
2341
2342
2343     use overload
2344         fallback => 0,
2345         qw("") => "_operator_stringify",
2346         "." => \&main::_operator_dot,
2347         ".=" => \&main::_operator_dot_equal,
2348     ;
2349
2350     sub _operator_stringify {
2351         my $self = shift;
2352
2353         return __PACKAGE__ . " object for " . $self->file;
2354     }
2355
2356     # flag to make sure extracted files are processed early
2357     my $seen_non_extracted_non_age = 0;
2358
2359     sub run {
2360         # Process the input object $self.  This opens and closes the file and
2361         # calls all the handlers for it.  Currently,  this can only be called
2362         # once per file, as it destroy's the EOF handler
2363
2364         my $self = shift;
2365         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2366
2367         my $addr = do { no overloading; pack 'J', $self; };
2368
2369         my $file = $file{$addr};
2370
2371         # Don't process if not expecting this file (because released later
2372         # than this Unicode version), and isn't there.  This means if someone
2373         # copies it into an earlier version's directory, we will go ahead and
2374         # process it.
2375         return if $first_released{$addr} gt $v_version && ! -e $file;
2376
2377         # If in debugging mode and this file doesn't have the non-skip
2378         # flag set, and isn't one of the critical files, skip it.
2379         if ($debug_skip
2380             && $first_released{$addr} ne v0
2381             && ! $non_skip{$addr})
2382         {
2383             print "Skipping $file in debugging\n" if $verbosity;
2384             return;
2385         }
2386
2387         # File could be optional
2388         if ($optional{$addr}) {
2389             return unless -e $file;
2390             my $result = eval $optional{$addr};
2391             if (! defined $result) {
2392                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2393                 return;
2394             }
2395             if (! $result) {
2396                 if ($verbosity) {
2397                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2398                 }
2399                 return;
2400             }
2401         }
2402
2403         if (! defined $file || ! -e $file) {
2404
2405             # If the file doesn't exist, see if have internal data for it
2406             # (based on first_released being 0).
2407             if ($first_released{$addr} eq v0) {
2408                 $handle{$addr} = 'pretend_is_open';
2409             }
2410             else {
2411                 if (! $optional{$addr}  # File could be optional
2412                     && $v_version ge $first_released{$addr})
2413                 {
2414                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2415                 }
2416                 return;
2417             }
2418         }
2419         else {
2420
2421             # Here, the file exists.  Some platforms may change the case of
2422             # its name
2423             if ($seen_non_extracted_non_age) {
2424                 if ($file =~ /$EXTRACTED/i) {
2425                     Carp::my_carp_bug(main::join_lines(<<END
2426 $file should be processed just after the 'Prop...Alias' files, and before
2427 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2428 have subtle problems
2429 END
2430                     ));
2431                 }
2432             }
2433             elsif ($EXTRACTED_DIR
2434                     && $first_released{$addr} ne v0
2435                     && $file !~ /$EXTRACTED/i
2436                     && lc($file) ne 'dage.txt')
2437             {
2438                 # We don't set this (by the 'if' above) if we have no
2439                 # extracted directory, so if running on an early version,
2440                 # this test won't work.  Not worth worrying about.
2441                 $seen_non_extracted_non_age = 1;
2442             }
2443
2444             # And mark the file as having being processed, and warn if it
2445             # isn't a file we are expecting.  As we process the files,
2446             # they are deleted from the hash, so any that remain at the
2447             # end of the program are files that we didn't process.
2448             my $fkey = File::Spec->rel2abs($file);
2449             my $expecting = delete $potential_files{lc($fkey)};
2450
2451             Carp::my_carp("Was not expecting '$file'.") if
2452                     ! $expecting
2453                     && ! defined $handle{$addr};
2454
2455             # Having deleted from expected files, we can quit if not to do
2456             # anything.  Don't print progress unless really want verbosity
2457             if ($skip{$addr}) {
2458                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2459                 return;
2460             }
2461
2462             # Open the file, converting the slashes used in this program
2463             # into the proper form for the OS
2464             my $file_handle;
2465             if (not open $file_handle, "<", $file) {
2466                 Carp::my_carp("Can't open $file.  Skipping: $!");
2467                 return 0;
2468             }
2469             $handle{$addr} = $file_handle; # Cache the open file handle
2470
2471             if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
2472                 $_ = <$file_handle>;
2473                 if ($_ !~ / - $string_version \. /x) {
2474                     chomp;
2475                     $_ =~ s/^#\s*//;
2476                     die Carp::my_carp("File '$file' is version '$_'.  It should be version $string_version");
2477                 }
2478             }
2479         }
2480
2481         if ($verbosity >= $PROGRESS) {
2482             if ($progress_message{$addr}) {
2483                 print "$progress_message{$addr}\n";
2484             }
2485             else {
2486                 # If using a virtual file, say so.
2487                 print "Processing ", (-e $file)
2488                                        ? $file
2489                                        : "substitute $file",
2490                                      "\n";
2491             }
2492         }
2493
2494
2495         # Call any special handler for before the file.
2496         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2497
2498         # Then the main handler
2499         &{$handler{$addr}}($self);
2500
2501         # Then any special post-file handler.
2502         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2503
2504         # If any errors have been accumulated, output the counts (as the first
2505         # error message in each class was output when it was encountered).
2506         if ($errors{$addr}) {
2507             my $total = 0;
2508             my $types = 0;
2509             foreach my $error (keys %{$errors{$addr}}) {
2510                 $total += $errors{$addr}->{$error};
2511                 delete $errors{$addr}->{$error};
2512                 $types++;
2513             }
2514             if ($total > 1) {
2515                 my $message
2516                         = "A total of $total lines had errors in $file.  ";
2517
2518                 $message .= ($types == 1)
2519                             ? '(Only the first one was displayed.)'
2520                             : '(Only the first of each type was displayed.)';
2521                 Carp::my_carp($message);
2522             }
2523         }
2524
2525         if (@{$missings{$addr}}) {
2526             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2527         }
2528
2529         # If a real file handle, close it.
2530         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2531                                                         ref $handle{$addr};
2532         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2533                                # the file, as opposed to undef
2534         return;
2535     }
2536
2537     sub _next_line {
2538         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2539         # if such a line exists.  'logical' means that any lines that have
2540         # been added via insert_lines() will be returned in $_ before the file
2541         # is read again.
2542
2543         my $self = shift;
2544         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2545
2546         my $addr = do { no overloading; pack 'J', $self; };
2547
2548         # Here the file is open (or if the handle is not a ref, is an open
2549         # 'virtual' file).  Get the next line; any inserted lines get priority
2550         # over the file itself.
2551         my $adjusted;
2552
2553         LINE:
2554         while (1) { # Loop until find non-comment, non-empty line
2555             #local $to_trace = 1 if main::DEBUG;
2556             my $inserted_ref = shift @{$added_lines{$addr}};
2557             if (defined $inserted_ref) {
2558                 ($adjusted, $_) = @{$inserted_ref};
2559                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2560                 return 1 if $adjusted;
2561             }
2562             else {
2563                 last if ! ref $handle{$addr}; # Don't read unless is real file
2564                 last if ! defined ($_ = readline $handle{$addr});
2565             }
2566             chomp;
2567             trace $_ if main::DEBUG && $to_trace;
2568
2569             # See if this line is the comment line that defines what property
2570             # value that code points that are not listed in the file should
2571             # have.  The format or existence of these lines is not guaranteed
2572             # by Unicode since they are comments, but the documentation says
2573             # that this was added for machine-readability, so probably won't
2574             # change.  This works starting in Unicode Version 5.0.  They look
2575             # like:
2576             #
2577             # @missing: 0000..10FFFF; Not_Reordered
2578             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2579             # @missing: 0000..10FFFF; ; NaN
2580             #
2581             # Save the line for a later get_missings() call.
2582             if (/$missing_defaults_prefix/) {
2583                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2584                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2585                 }
2586                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2587                     my @defaults = split /\s* ; \s*/x, $_;
2588
2589                     # The first field is the @missing, which ends in a
2590                     # semi-colon, so can safely shift.
2591                     shift @defaults;
2592
2593                     # Some of these lines may have empty field placeholders
2594                     # which get in the way.  An example is:
2595                     # @missing: 0000..10FFFF; ; NaN
2596                     # Remove them.  Process starting from the top so the
2597                     # splice doesn't affect things still to be looked at.
2598                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2599                         next if $defaults[$i] ne "";
2600                         splice @defaults, $i, 1;
2601                     }
2602
2603                     # What's left should be just the property (maybe) and the
2604                     # default.  Having only one element means it doesn't have
2605                     # the property.
2606                     my $default;
2607                     my $property;
2608                     if (@defaults >= 1) {
2609                         if (@defaults == 1) {
2610                             $default = $defaults[0];
2611                         }
2612                         else {
2613                             $property = $defaults[0];
2614                             $default = $defaults[1];
2615                         }
2616                     }
2617
2618                     if (@defaults < 1
2619                         || @defaults > 2
2620                         || ($default =~ /^</
2621                             && $default !~ /^<code *point>$/i
2622                             && $default !~ /^<none>$/i
2623                             && $default !~ /^<script>$/i))
2624                     {
2625                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2626                     }
2627                     else {
2628
2629                         # If the property is missing from the line, it should
2630                         # be the one for the whole file
2631                         $property = $property{$addr} if ! defined $property;
2632
2633                         # Change <none> to the null string, which is what it
2634                         # really means.  If the default is the code point
2635                         # itself, set it to <code point>, which is what
2636                         # Unicode uses (but sometimes they've forgotten the
2637                         # space)
2638                         if ($default =~ /^<none>$/i) {
2639                             $default = "";
2640                         }
2641                         elsif ($default =~ /^<code *point>$/i) {
2642                             $default = $CODE_POINT;
2643                         }
2644                         elsif ($default =~ /^<script>$/i) {
2645
2646                             # Special case this one.  Currently is from
2647                             # ScriptExtensions.txt, and means for all unlisted
2648                             # code points, use their Script property values.
2649                             # For the code points not listed in that file, the
2650                             # default value is 'Unknown'.
2651                             $default = "Unknown";
2652                         }
2653
2654                         # Store them as a sub-arrays with both components.
2655                         push @{$missings{$addr}}, [ $default, $property ];
2656                     }
2657                 }
2658
2659                 # There is nothing for the caller to process on this comment
2660                 # line.
2661                 next;
2662             }
2663
2664             # Remove comments and trailing space, and skip this line if the
2665             # result is empty
2666             s/#.*//;
2667             s/\s+$//;
2668             next if /^$/;
2669
2670             # Call any handlers for this line, and skip further processing of
2671             # the line if the handler sets the line to null.
2672             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2673                 &{$sub_ref}($self);
2674                 next LINE if /^$/;
2675             }
2676
2677             # Here the line is ok.  return success.
2678             return 1;
2679         } # End of looping through lines.
2680
2681         # If there is an EOF handler, call it (only once) and if it generates
2682         # more lines to process go back in the loop to handle them.
2683         if ($eof_handler{$addr}) {
2684             &{$eof_handler{$addr}}($self);
2685             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2686             goto LINE if $added_lines{$addr};
2687         }
2688
2689         # Return failure -- no more lines.
2690         return 0;
2691
2692     }
2693
2694     sub _next_line_with_remapped_range {
2695         my $self = shift;
2696         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2697
2698         # like _next_line(), but for use on non-ASCII platforms.  It sets $_
2699         # to be the next logical input line, if any.  Returns non-zero if such
2700         # a line exists.  'logical' means that any lines that have been added
2701         # via insert_lines() will be returned in $_ before the file is read
2702         # again.
2703         #
2704         # The difference from _next_line() is that this remaps the Unicode
2705         # code points in the input to those of the native platform.  Each
2706         # input line contains a single code point, or a single contiguous
2707         # range of them  This routine splits each range into its individual
2708         # code points and caches them.  It returns the cached values,
2709         # translated into their native equivalents, one at a time, for each
2710         # call, before reading the next line.  Since native values can only be
2711         # a single byte wide, no translation is needed for code points above
2712         # 0xFF, and ranges that are entirely above that number are not split.
2713         # If an input line contains the range 254-1000, it would be split into
2714         # three elements: 254, 255, and 256-1000.  (The downstream table
2715         # insertion code will sort and coalesce the individual code points
2716         # into appropriate ranges.)
2717
2718         my $addr = do { no overloading; pack 'J', $self; };
2719
2720         while (1) {
2721
2722             # Look in cache before reading the next line.  Return any cached
2723             # value, translated
2724             my $inserted = shift @{$remapped_lines{$addr}};
2725             if (defined $inserted) {
2726                 trace $inserted if main::DEBUG && $to_trace;
2727                 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
2728                 trace $_ if main::DEBUG && $to_trace;
2729                 return 1;
2730             }
2731
2732             # Get the next line.
2733             return 0 unless _next_line($self);
2734
2735             # If there is a special handler for it, return the line,
2736             # untranslated.  This should happen only for files that are
2737             # special, not being code-point related, such as property names.
2738             return 1 if $handler{$addr}
2739                                     != \&main::process_generic_property_file;
2740
2741             my ($range, $property_name, $map, @remainder)
2742                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
2743
2744             if (@remainder
2745                 || ! defined $property_name
2746                 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
2747             {
2748                 Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
2749             }
2750
2751             my $low = hex $1;
2752             my $high = (defined $2) ? hex $2 : $low;
2753
2754             # If the input maps the range to another code point, remap the
2755             # target if it is between 0 and 255.
2756             my $tail;
2757             if (defined $map) {
2758                 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
2759                 $tail = "$property_name; $map";
2760                 $_ = "$range; $tail";
2761             }
2762             else {
2763                 $tail = $property_name;
2764             }
2765
2766             # If entire range is above 255, just return it, unchanged (except
2767             # any mapped-to code point, already changed above)
2768             return 1 if $low > 255;
2769
2770             # Cache an entry for every code point < 255.  For those in the
2771             # range above 255, return a dummy entry for just that portion of
2772             # the range.  Note that this will be out-of-order, but that is not
2773             # a problem.
2774             foreach my $code_point ($low .. $high) {
2775                 if ($code_point > 255) {
2776                     $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
2777                     return 1;
2778                 }
2779                 push @{$remapped_lines{$addr}}, "$code_point; $tail";
2780             }
2781         } # End of looping through lines.
2782
2783         # NOTREACHED
2784     }
2785
2786 #   Not currently used, not fully tested.
2787 #    sub peek {
2788 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2789 #        # record.  Not callable from an each_line_handler(), nor does it call
2790 #        # an each_line_handler() on the line.
2791 #
2792 #        my $self = shift;
2793 #        my $addr = do { no overloading; pack 'J', $self; };
2794 #
2795 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2796 #            my ($adjusted, $line) = @{$inserted_ref};
2797 #            next if $adjusted;
2798 #
2799 #            # Remove comments and trailing space, and return a non-empty
2800 #            # resulting line
2801 #            $line =~ s/#.*//;
2802 #            $line =~ s/\s+$//;
2803 #            return $line if $line ne "";
2804 #        }
2805 #
2806 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2807 #        while (1) { # Loop until find non-comment, non-empty line
2808 #            local $to_trace = 1 if main::DEBUG;
2809 #            trace $_ if main::DEBUG && $to_trace;
2810 #            return if ! defined (my $line = readline $handle{$addr});
2811 #            chomp $line;
2812 #            push @{$added_lines{$addr}}, [ 0, $line ];
2813 #
2814 #            $line =~ s/#.*//;
2815 #            $line =~ s/\s+$//;
2816 #            return $line if $line ne "";
2817 #        }
2818 #
2819 #        return;
2820 #    }
2821
2822
2823     sub insert_lines {
2824         # Lines can be inserted so that it looks like they were in the input
2825         # file at the place it was when this routine is called.  See also
2826         # insert_adjusted_lines().  Lines inserted via this routine go through
2827         # any each_line_handler()
2828
2829         my $self = shift;
2830
2831         # Each inserted line is an array, with the first element being 0 to
2832         # indicate that this line hasn't been adjusted, and needs to be
2833         # processed.
2834         no overloading;
2835         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2836         return;
2837     }
2838
2839     sub insert_adjusted_lines {
2840         # Lines can be inserted so that it looks like they were in the input
2841         # file at the place it was when this routine is called.  See also
2842         # insert_lines().  Lines inserted via this routine are already fully
2843         # adjusted, ready to be processed; each_line_handler()s handlers will
2844         # not be called.  This means this is not a completely general
2845         # facility, as only the last each_line_handler on the stack should
2846         # call this.  It could be made more general, by passing to each of the
2847         # line_handlers their position on the stack, which they would pass on
2848         # to this routine, and that would replace the boolean first element in
2849         # the anonymous array pushed here, so that the next_line routine could
2850         # use that to call only those handlers whose index is after it on the
2851         # stack.  But this is overkill for what is needed now.
2852
2853         my $self = shift;
2854         trace $_[0] if main::DEBUG && $to_trace;
2855
2856         # Each inserted line is an array, with the first element being 1 to
2857         # indicate that this line has been adjusted
2858         no overloading;
2859         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2860         return;
2861     }
2862
2863     sub get_missings {
2864         # Returns the stored up @missings lines' values, and clears the list.
2865         # The values are in an array, consisting of the default in the first
2866         # element, and the property in the 2nd.  However, since these lines
2867         # can be stacked up, the return is an array of all these arrays.
2868
2869         my $self = shift;
2870         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2871
2872         my $addr = do { no overloading; pack 'J', $self; };
2873
2874         # If not accepting a list return, just return the first one.
2875         return shift @{$missings{$addr}} unless wantarray;
2876
2877         my @return = @{$missings{$addr}};
2878         undef @{$missings{$addr}};
2879         return @return;
2880     }
2881
2882     sub _insert_property_into_line {
2883         # Add a property field to $_, if this file requires it.
2884
2885         my $self = shift;
2886         my $addr = do { no overloading; pack 'J', $self; };
2887         my $property = $property{$addr};
2888         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2889
2890         $_ =~ s/(;|$)/; $property$1/;
2891         return;
2892     }
2893
2894     sub carp_bad_line {
2895         # Output consistent error messages, using either a generic one, or the
2896         # one given by the optional parameter.  To avoid gazillions of the
2897         # same message in case the syntax of a  file is way off, this routine
2898         # only outputs the first instance of each message, incrementing a
2899         # count so the totals can be output at the end of the file.
2900
2901         my $self = shift;
2902         my $message = shift;
2903         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2904
2905         my $addr = do { no overloading; pack 'J', $self; };
2906
2907         $message = 'Unexpected line' unless $message;
2908
2909         # No trailing punctuation so as to fit with our addenda.
2910         $message =~ s/[.:;,]$//;
2911
2912         # If haven't seen this exact message before, output it now.  Otherwise
2913         # increment the count of how many times it has occurred
2914         unless ($errors{$addr}->{$message}) {
2915             Carp::my_carp("$message in '$_' in "
2916                             . $file{$addr}
2917                             . " at line $..  Skipping this line;");
2918             $errors{$addr}->{$message} = 1;
2919         }
2920         else {
2921             $errors{$addr}->{$message}++;
2922         }
2923
2924         # Clear the line to prevent any further (meaningful) processing of it.
2925         $_ = "";
2926
2927         return;
2928     }
2929 } # End closure
2930
2931 package Multi_Default;
2932
2933 # Certain properties in early versions of Unicode had more than one possible
2934 # default for code points missing from the files.  In these cases, one
2935 # default applies to everything left over after all the others are applied,
2936 # and for each of the others, there is a description of which class of code
2937 # points applies to it.  This object helps implement this by storing the
2938 # defaults, and for all but that final default, an eval string that generates
2939 # the class that it applies to.
2940
2941
2942 {   # Closure
2943
2944     main::setup_package();
2945
2946     my %class_defaults;
2947     # The defaults structure for the classes
2948     main::set_access('class_defaults', \%class_defaults);
2949
2950     my %other_default;
2951     # The default that applies to everything left over.
2952     main::set_access('other_default', \%other_default, 'r');
2953
2954
2955     sub new {
2956         # The constructor is called with default => eval pairs, terminated by
2957         # the left-over default. e.g.
2958         # Multi_Default->new(
2959         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2960         #               -  0x200D',
2961         #        'R' => 'some other expression that evaluates to code points',
2962         #        .
2963         #        .
2964         #        .
2965         #        'U'));
2966
2967         my $class = shift;
2968
2969         my $self = bless \do{my $anonymous_scalar}, $class;
2970         my $addr = do { no overloading; pack 'J', $self; };
2971
2972         while (@_ > 1) {
2973             my $default = shift;
2974             my $eval = shift;
2975             $class_defaults{$addr}->{$default} = $eval;
2976         }
2977
2978         $other_default{$addr} = shift;
2979
2980         return $self;
2981     }
2982
2983     sub get_next_defaults {
2984         # Iterates and returns the next class of defaults.
2985         my $self = shift;
2986         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2987
2988         my $addr = do { no overloading; pack 'J', $self; };
2989
2990         return each %{$class_defaults{$addr}};
2991     }
2992 }
2993
2994 package Alias;
2995
2996 # An alias is one of the names that a table goes by.  This class defines them
2997 # including some attributes.  Everything is currently setup in the
2998 # constructor.
2999
3000
3001 {   # Closure
3002
3003     main::setup_package();
3004
3005     my %name;
3006     main::set_access('name', \%name, 'r');
3007
3008     my %loose_match;
3009     # Should this name match loosely or not.
3010     main::set_access('loose_match', \%loose_match, 'r');
3011
3012     my %make_re_pod_entry;
3013     # Some aliases should not get their own entries in the re section of the
3014     # pod, because they are covered by a wild-card, and some we want to
3015     # discourage use of.  Binary
3016     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3017
3018     my %ucd;
3019     # Is this documented to be accessible via Unicode::UCD
3020     main::set_access('ucd', \%ucd, 'r', 's');
3021
3022     my %status;
3023     # Aliases have a status, like deprecated, or even suppressed (which means
3024     # they don't appear in documentation).  Enum
3025     main::set_access('status', \%status, 'r');
3026
3027     my %ok_as_filename;
3028     # Similarly, some aliases should not be considered as usable ones for
3029     # external use, such as file names, or we don't want documentation to
3030     # recommend them.  Boolean
3031     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3032
3033     sub new {
3034         my $class = shift;
3035
3036         my $self = bless \do { my $anonymous_scalar }, $class;
3037         my $addr = do { no overloading; pack 'J', $self; };
3038
3039         $name{$addr} = shift;
3040         $loose_match{$addr} = shift;
3041         $make_re_pod_entry{$addr} = shift;
3042         $ok_as_filename{$addr} = shift;
3043         $status{$addr} = shift;
3044         $ucd{$addr} = shift;
3045
3046         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3047
3048         # Null names are never ok externally
3049         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3050
3051         return $self;
3052     }
3053 }
3054
3055 package Range;
3056
3057 # A range is the basic unit for storing code points, and is described in the
3058 # comments at the beginning of the program.  Each range has a starting code
3059 # point; an ending code point (not less than the starting one); a value
3060 # that applies to every code point in between the two end-points, inclusive;
3061 # and an enum type that applies to the value.  The type is for the user's
3062 # convenience, and has no meaning here, except that a non-zero type is
3063 # considered to not obey the normal Unicode rules for having standard forms.
3064 #
3065 # The same structure is used for both map and match tables, even though in the
3066 # latter, the value (and hence type) is irrelevant and could be used as a
3067 # comment.  In map tables, the value is what all the code points in the range
3068 # map to.  Type 0 values have the standardized version of the value stored as
3069 # well, so as to not have to recalculate it a lot.
3070
3071 sub trace { return main::trace(@_); }
3072
3073 {   # Closure
3074
3075     main::setup_package();
3076
3077     my %start;
3078     main::set_access('start', \%start, 'r', 's');
3079
3080     my %end;
3081     main::set_access('end', \%end, 'r', 's');
3082
3083     my %value;
3084     main::set_access('value', \%value, 'r');
3085
3086     my %type;
3087     main::set_access('type', \%type, 'r');
3088
3089     my %standard_form;
3090     # The value in internal standard form.  Defined only if the type is 0.
3091     main::set_access('standard_form', \%standard_form);
3092
3093     # Note that if these fields change, the dump() method should as well
3094
3095     sub new {
3096         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3097         my $class = shift;
3098
3099         my $self = bless \do { my $anonymous_scalar }, $class;
3100         my $addr = do { no overloading; pack 'J', $self; };
3101
3102         $start{$addr} = shift;
3103         $end{$addr} = shift;
3104
3105         my %args = @_;
3106
3107         my $value = delete $args{'Value'};  # Can be 0
3108         $value = "" unless defined $value;
3109         $value{$addr} = $value;
3110
3111         $type{$addr} = delete $args{'Type'} || 0;
3112
3113         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3114
3115         return $self;
3116     }
3117
3118     use overload
3119         fallback => 0,
3120         qw("") => "_operator_stringify",
3121         "." => \&main::_operator_dot,
3122         ".=" => \&main::_operator_dot_equal,
3123     ;
3124
3125     sub _operator_stringify {
3126         my $self = shift;
3127         my $addr = do { no overloading; pack 'J', $self; };
3128
3129         # Output it like '0041..0065 (value)'
3130         my $return = sprintf("%04X", $start{$addr})
3131                         .  '..'
3132                         . sprintf("%04X", $end{$addr});
3133         my $value = $value{$addr};
3134         my $type = $type{$addr};
3135         $return .= ' (';
3136         $return .= "$value";
3137         $return .= ", Type=$type" if $type != 0;
3138         $return .= ')';
3139
3140         return $return;
3141     }
3142
3143     sub standard_form {
3144         # Calculate the standard form only if needed, and cache the result.
3145         # The standard form is the value itself if the type is special.
3146         # This represents a considerable CPU and memory saving - at the time
3147         # of writing there are 368676 non-special objects, but the standard
3148         # form is only requested for 22047 of them - ie about 6%.
3149
3150         my $self = shift;
3151         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3152
3153         my $addr = do { no overloading; pack 'J', $self; };
3154
3155         return $standard_form{$addr} if defined $standard_form{$addr};
3156
3157         my $value = $value{$addr};
3158         return $value if $type{$addr};
3159         return $standard_form{$addr} = main::standardize($value);
3160     }
3161
3162     sub dump {
3163         # Human, not machine readable.  For machine readable, comment out this
3164         # entire routine and let the standard one take effect.
3165         my $self = shift;
3166         my $indent = shift;
3167         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3168
3169         my $addr = do { no overloading; pack 'J', $self; };
3170
3171         my $return = $indent
3172                     . sprintf("%04X", $start{$addr})
3173                     . '..'
3174                     . sprintf("%04X", $end{$addr})
3175                     . " '$value{$addr}';";
3176         if (! defined $standard_form{$addr}) {
3177             $return .= "(type=$type{$addr})";
3178         }
3179         elsif ($standard_form{$addr} ne $value{$addr}) {
3180             $return .= "(standard '$standard_form{$addr}')";
3181         }
3182         return $return;
3183     }
3184 } # End closure
3185
3186 package _Range_List_Base;
3187
3188 # Base class for range lists.  A range list is simply an ordered list of
3189 # ranges, so that the ranges with the lowest starting numbers are first in it.
3190 #
3191 # When a new range is added that is adjacent to an existing range that has the
3192 # same value and type, it merges with it to form a larger range.
3193 #
3194 # Ranges generally do not overlap, except that there can be multiple entries
3195 # of single code point ranges.  This is because of NameAliases.txt.
3196 #
3197 # In this program, there is a standard value such that if two different
3198 # values, have the same standard value, they are considered equivalent.  This
3199 # value was chosen so that it gives correct results on Unicode data
3200
3201 # There are a number of methods to manipulate range lists, and some operators
3202 # are overloaded to handle them.
3203
3204 sub trace { return main::trace(@_); }
3205
3206 { # Closure
3207
3208     our $addr;
3209
3210     # Max is initialized to a negative value that isn't adjacent to 0, for
3211     # simpler tests
3212     my $max_init = -2;
3213
3214     main::setup_package();
3215
3216     my %ranges;
3217     # The list of ranges
3218     main::set_access('ranges', \%ranges, 'readable_array');
3219
3220     my %max;
3221     # The highest code point in the list.  This was originally a method, but
3222     # actual measurements said it was used a lot.
3223     main::set_access('max', \%max, 'r');
3224
3225     my %each_range_iterator;
3226     # Iterator position for each_range()
3227     main::set_access('each_range_iterator', \%each_range_iterator);
3228
3229     my %owner_name_of;
3230     # Name of parent this is attached to, if any.  Solely for better error
3231     # messages.
3232     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3233
3234     my %_search_ranges_cache;
3235     # A cache of the previous result from _search_ranges(), for better
3236     # performance
3237     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3238
3239     sub new {
3240         my $class = shift;
3241         my %args = @_;
3242
3243         # Optional initialization data for the range list.
3244         my $initialize = delete $args{'Initialize'};
3245
3246         my $self;
3247
3248         # Use _union() to initialize.  _union() returns an object of this
3249         # class, which means that it will call this constructor recursively.
3250         # But it won't have this $initialize parameter so that it won't
3251         # infinitely loop on this.
3252         return _union($class, $initialize, %args) if defined $initialize;
3253
3254         $self = bless \do { my $anonymous_scalar }, $class;
3255         my $addr = do { no overloading; pack 'J', $self; };
3256
3257         # Optional parent object, only for debug info.
3258         $owner_name_of{$addr} = delete $args{'Owner'};
3259         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3260
3261         # Stringify, in case it is an object.
3262         $owner_name_of{$addr} = "$owner_name_of{$addr}";
3263
3264         # This is used only for error messages, and so a colon is added
3265         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3266
3267         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3268
3269         $max{$addr} = $max_init;
3270
3271         $_search_ranges_cache{$addr} = 0;
3272         $ranges{$addr} = [];
3273
3274         return $self;
3275     }
3276
3277     use overload
3278         fallback => 0,
3279         qw("") => "_operator_stringify",
3280         "." => \&main::_operator_dot,
3281         ".=" => \&main::_operator_dot_equal,
3282     ;
3283
3284     sub _operator_stringify {
3285         my $self = shift;
3286         my $addr = do { no overloading; pack 'J', $self; };
3287
3288         return "Range_List attached to '$owner_name_of{$addr}'"
3289                                                 if $owner_name_of{$addr};
3290         return "anonymous Range_List " . \$self;
3291     }
3292
3293     sub _union {
3294         # Returns the union of the input code points.  It can be called as
3295         # either a constructor or a method.  If called as a method, the result
3296         # will be a new() instance of the calling object, containing the union
3297         # of that object with the other parameter's code points;  if called as
3298         # a constructor, the first parameter gives the class that the new object
3299         # should be, and the second parameter gives the code points to go into
3300         # it.
3301         # In either case, there are two parameters looked at by this routine;
3302         # any additional parameters are passed to the new() constructor.
3303         #
3304         # The code points can come in the form of some object that contains
3305         # ranges, and has a conventionally named method to access them; or
3306         # they can be an array of individual code points (as integers); or
3307         # just a single code point.
3308         #
3309         # If they are ranges, this routine doesn't make any effort to preserve
3310         # the range values and types of one input over the other.  Therefore
3311         # this base class should not allow _union to be called from other than
3312         # initialization code, so as to prevent two tables from being added
3313         # together where the range values matter.  The general form of this
3314         # routine therefore belongs in a derived class, but it was moved here
3315         # to avoid duplication of code.  The failure to overload this in this
3316         # class keeps it safe.
3317         #
3318         # It does make the effort during initialization to accept tables with
3319         # multiple values for the same code point, and to preserve the order
3320         # of these.  If there is only one input range or range set, it doesn't
3321         # sort (as it should already be sorted to the desired order), and will
3322         # accept multiple values per code point.  Otherwise it will merge
3323         # multiple values into a single one.
3324
3325         my $self;
3326         my @args;   # Arguments to pass to the constructor
3327
3328         my $class = shift;
3329
3330         # If a method call, will start the union with the object itself, and
3331         # the class of the new object will be the same as self.
3332         if (ref $class) {
3333             $self = $class;
3334             $class = ref $self;
3335             push @args, $self;
3336         }
3337
3338         # Add the other required parameter.
3339         push @args, shift;
3340         # Rest of parameters are passed on to the constructor
3341
3342         # Accumulate all records from both lists.
3343         my @records;
3344         my $input_count = 0;
3345         for my $arg (@args) {
3346             #local $to_trace = 0 if main::DEBUG;
3347             trace "argument = $arg" if main::DEBUG && $to_trace;
3348             if (! defined $arg) {
3349                 my $message = "";
3350                 if (defined $self) {
3351                     no overloading;
3352                     $message .= $owner_name_of{pack 'J', $self};
3353                 }
3354                 Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3355                 return;
3356             }
3357
3358             $arg = [ $arg ] if ! ref $arg;
3359             my $type = ref $arg;
3360             if ($type eq 'ARRAY') {
3361                 foreach my $element (@$arg) {
3362                     push @records, Range->new($element, $element);
3363                     $input_count++;
3364                 }
3365             }
3366             elsif ($arg->isa('Range')) {
3367                 push @records, $arg;
3368                 $input_count++;
3369             }
3370             elsif ($arg->can('ranges')) {
3371                 push @records, $arg->ranges;
3372                 $input_count++;
3373             }
3374             else {
3375                 my $message = "";
3376                 if (defined $self) {
3377                     no overloading;
3378                     $message .= $owner_name_of{pack 'J', $self};
3379                 }
3380                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3381                 return;
3382             }
3383         }
3384
3385         # Sort with the range containing the lowest ordinal first, but if
3386         # two ranges start at the same code point, sort with the bigger range
3387         # of the two first, because it takes fewer cycles.
3388         if ($input_count > 1) {
3389             @records = sort { ($a->start <=> $b->start)
3390                                       or
3391                                     # if b is shorter than a, b->end will be
3392                                     # less than a->end, and we want to select
3393                                     # a, so want to return -1
3394                                     ($b->end <=> $a->end)
3395                                    } @records;
3396         }
3397
3398         my $new = $class->new(@_);
3399
3400         # Fold in records so long as they add new information.
3401         for my $set (@records) {
3402             my $start = $set->start;
3403             my $end   = $set->end;
3404             my $value = $set->value;
3405             my $type  = $set->type;
3406             if ($start > $new->max) {
3407                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3408             }
3409             elsif ($end > $new->max) {
3410                 $new->_add_delete('+', $new->max +1, $end, $value,
3411                                                                 Type => $type);
3412             }
3413             elsif ($input_count == 1) {
3414                 # Here, overlaps existing range, but is from a single input,
3415                 # so preserve the multiple values from that input.
3416                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3417                                                 Replace => $MULTIPLE_AFTER);
3418             }
3419         }
3420
3421         return $new;
3422     }
3423
3424     sub range_count {        # Return the number of ranges in the range list
3425         my $self = shift;
3426         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3427
3428         no overloading;
3429         return scalar @{$ranges{pack 'J', $self}};
3430     }
3431
3432     sub min {
3433         # Returns the minimum code point currently in the range list, or if
3434         # the range list is empty, 2 beyond the max possible.  This is a
3435         # method because used so rarely, that not worth saving between calls,
3436         # and having to worry about changing it as ranges are added and
3437         # deleted.
3438
3439         my $self = shift;
3440         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3441
3442         my $addr = do { no overloading; pack 'J', $self; };
3443
3444         # If the range list is empty, return a large value that isn't adjacent
3445         # to any that could be in the range list, for simpler tests
3446         return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3447         return $ranges{$addr}->[0]->start;
3448     }
3449
3450     sub contains {
3451         # Boolean: Is argument in the range list?  If so returns $i such that:
3452         #   range[$i]->end < $codepoint <= range[$i+1]->end
3453         # which is one beyond what you want; this is so that the 0th range
3454         # doesn't return false
3455         my $self = shift;
3456         my $codepoint = shift;
3457         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3458
3459         my $i = $self->_search_ranges($codepoint);
3460         return 0 unless defined $i;
3461
3462         # The search returns $i, such that
3463         #   range[$i-1]->end < $codepoint <= range[$i]->end
3464         # So is in the table if and only iff it is at least the start position
3465         # of range $i.
3466         no overloading;
3467         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3468         return $i + 1;
3469     }
3470
3471     sub containing_range {
3472         # Returns the range object that contains the code point, undef if none
3473
3474         my $self = shift;
3475         my $codepoint = shift;
3476         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3477
3478         my $i = $self->contains($codepoint);
3479         return unless $i;
3480
3481         # contains() returns 1 beyond where we should look
3482         no overloading;
3483         return $ranges{pack 'J', $self}->[$i-1];
3484     }
3485
3486     sub value_of {
3487         # Returns the value associated with the code point, undef if none
3488
3489         my $self = shift;
3490         my $codepoint = shift;
3491         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3492
3493         my $range = $self->containing_range($codepoint);
3494         return unless defined $range;
3495
3496         return $range->value;
3497     }
3498
3499     sub type_of {
3500         # Returns the type of the range containing the code point, undef if
3501         # the code point is not in the table
3502
3503         my $self = shift;
3504         my $codepoint = shift;
3505         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3506
3507         my $range = $self->containing_range($codepoint);
3508         return unless defined $range;
3509
3510         return $range->type;
3511     }
3512
3513     sub _search_ranges {
3514         # Find the range in the list which contains a code point, or where it
3515         # should go if were to add it.  That is, it returns $i, such that:
3516         #   range[$i-1]->end < $codepoint <= range[$i]->end
3517         # Returns undef if no such $i is possible (e.g. at end of table), or
3518         # if there is an error.
3519
3520         my $self = shift;
3521         my $code_point = shift;
3522         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3523
3524         my $addr = do { no overloading; pack 'J', $self; };
3525
3526         return if $code_point > $max{$addr};
3527         my $r = $ranges{$addr};                # The current list of ranges
3528         my $range_list_size = scalar @$r;
3529         my $i;
3530
3531         use integer;        # want integer division
3532
3533         # Use the cached result as the starting guess for this one, because,
3534         # an experiment on 5.1 showed that 90% of the time the cache was the
3535         # same as the result on the next call (and 7% it was one less).
3536         $i = $_search_ranges_cache{$addr};
3537         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3538                                             # from an intervening deletion
3539         #local $to_trace = 1 if main::DEBUG;
3540         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3541         return $i if $code_point <= $r->[$i]->end
3542                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3543
3544         # Here the cache doesn't yield the correct $i.  Try adding 1.
3545         if ($i < $range_list_size - 1
3546             && $r->[$i]->end < $code_point &&
3547             $code_point <= $r->[$i+1]->end)
3548         {
3549             $i++;
3550             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3551             $_search_ranges_cache{$addr} = $i;
3552             return $i;
3553         }
3554
3555         # Here, adding 1 also didn't work.  We do a binary search to
3556         # find the correct position, starting with current $i
3557         my $lower = 0;
3558         my $upper = $range_list_size - 1;
3559         while (1) {
3560             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3561
3562             if ($code_point <= $r->[$i]->end) {
3563
3564                 # Here we have met the upper constraint.  We can quit if we
3565                 # also meet the lower one.
3566                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3567
3568                 $upper = $i;        # Still too high.
3569
3570             }
3571             else {
3572
3573                 # Here, $r[$i]->end < $code_point, so look higher up.
3574                 $lower = $i;
3575             }
3576
3577             # Split search domain in half to try again.
3578             my $temp = ($upper + $lower) / 2;
3579
3580             # No point in continuing unless $i changes for next time
3581             # in the loop.
3582             if ($temp == $i) {
3583
3584                 # We can't reach the highest element because of the averaging.
3585                 # So if one below the upper edge, force it there and try one
3586                 # more time.
3587                 if ($i == $range_list_size - 2) {
3588
3589                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3590                     $i = $range_list_size - 1;
3591
3592                     # Change $lower as well so if fails next time through,
3593                     # taking the average will yield the same $i, and we will
3594                     # quit with the error message just below.
3595                     $lower = $i;
3596                     next;
3597                 }
3598                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3599                 return;
3600             }
3601             $i = $temp;
3602         } # End of while loop
3603
3604         if (main::DEBUG && $to_trace) {
3605             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3606             trace "i=  [ $i ]", $r->[$i];
3607             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3608         }
3609
3610         # Here we have found the offset.  Cache it as a starting point for the
3611         # next call.
3612         $_search_ranges_cache{$addr} = $i;
3613         return $i;
3614     }
3615
3616     sub _add_delete {
3617         # Add, replace or delete ranges to or from a list.  The $type
3618         # parameter gives which:
3619         #   '+' => insert or replace a range, returning a list of any changed
3620         #          ranges.
3621         #   '-' => delete a range, returning a list of any deleted ranges.
3622         #
3623         # The next three parameters give respectively the start, end, and
3624         # value associated with the range.  'value' should be null unless the
3625         # operation is '+';
3626         #
3627         # The range list is kept sorted so that the range with the lowest
3628         # starting position is first in the list, and generally, adjacent
3629         # ranges with the same values are merged into a single larger one (see
3630         # exceptions below).
3631         #
3632         # There are more parameters; all are key => value pairs:
3633         #   Type    gives the type of the value.  It is only valid for '+'.
3634         #           All ranges have types; if this parameter is omitted, 0 is
3635         #           assumed.  Ranges with type 0 are assumed to obey the
3636         #           Unicode rules for casing, etc; ranges with other types are
3637         #           not.  Otherwise, the type is arbitrary, for the caller's
3638         #           convenience, and looked at only by this routine to keep
3639         #           adjacent ranges of different types from being merged into
3640         #           a single larger range, and when Replace =>
3641         #           $IF_NOT_EQUIVALENT is specified (see just below).
3642         #   Replace  determines what to do if the range list already contains
3643         #            ranges which coincide with all or portions of the input
3644         #            range.  It is only valid for '+':
3645         #       => $NO            means that the new value is not to replace
3646         #                         any existing ones, but any empty gaps of the
3647         #                         range list coinciding with the input range
3648         #                         will be filled in with the new value.
3649         #       => $UNCONDITIONALLY  means to replace the existing values with
3650         #                         this one unconditionally.  However, if the
3651         #                         new and old values are identical, the
3652         #                         replacement is skipped to save cycles
3653         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3654         #          (the default)  with this one if they are not equivalent.
3655         #                         Ranges are equivalent if their types are the
3656         #                         same, and they are the same string; or if
3657         #                         both are type 0 ranges, if their Unicode
3658         #                         standard forms are identical.  In this last
3659         #                         case, the routine chooses the more "modern"
3660         #                         one to use.  This is because some of the
3661         #                         older files are formatted with values that
3662         #                         are, for example, ALL CAPs, whereas the
3663         #                         derived files have a more modern style,
3664         #                         which looks better.  By looking for this
3665         #                         style when the pre-existing and replacement
3666         #                         standard forms are the same, we can move to
3667         #                         the modern style
3668         #       => $MULTIPLE_BEFORE means that if this range duplicates an
3669         #                         existing one, but has a different value,
3670         #                         don't replace the existing one, but insert
3671         #                         this, one so that the same range can occur
3672         #                         multiple times.  They are stored LIFO, so
3673         #                         that the final one inserted is the first one
3674         #                         returned in an ordered search of the table.
3675         #                         If this is an exact duplicate, including the
3676         #                         value, the original will be moved to be
3677         #                         first, before any other duplicate ranges
3678         #                         with different values.
3679         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3680         #                         FIFO, so that this one is inserted after all
3681         #                         others that currently exist.  If this is an
3682         #                         exact duplicate, including value, of an
3683         #                         existing range, this one is discarded
3684         #                         (leaving the existing one in its original,
3685         #                         higher priority position
3686         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3687         #
3688         # "same value" means identical for non-type-0 ranges, and it means
3689         # having the same standard forms for type-0 ranges.
3690
3691         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3692
3693         my $self = shift;
3694         my $operation = shift;   # '+' for add/replace; '-' for delete;
3695         my $start = shift;
3696         my $end   = shift;
3697         my $value = shift;
3698
3699         my %args = @_;
3700
3701         $value = "" if not defined $value;        # warning: $value can be "0"
3702
3703         my $replace = delete $args{'Replace'};
3704         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3705
3706         my $type = delete $args{'Type'};
3707         $type = 0 unless defined $type;
3708
3709         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3710
3711         my $addr = do { no overloading; pack 'J', $self; };
3712
3713         if ($operation ne '+' && $operation ne '-') {
3714             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3715             return;
3716         }
3717         unless (defined $start && defined $end) {
3718             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3719             return;
3720         }
3721         unless ($end >= $start) {
3722             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3723             return;
3724         }
3725         if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3726             Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ".  Adding it anyway");
3727         }
3728         #local $to_trace = 1 if main::DEBUG;
3729
3730         if ($operation eq '-') {
3731             if ($replace != $IF_NOT_EQUIVALENT) {
3732                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3733                 $replace = $IF_NOT_EQUIVALENT;
3734             }
3735             if ($type) {
3736                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3737                 $type = 0;
3738             }
3739             if ($value ne "") {
3740                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3741                 $value = "";
3742             }
3743         }
3744
3745         my $r = $ranges{$addr};               # The current list of ranges
3746         my $range_list_size = scalar @$r;     # And its size
3747         my $max = $max{$addr};                # The current high code point in
3748                                               # the list of ranges
3749
3750         # Do a special case requiring fewer machine cycles when the new range
3751         # starts after the current highest point.  The Unicode input data is
3752         # structured so this is common.
3753         if ($start > $max) {
3754
3755             trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3756             return if $operation eq '-'; # Deleting a non-existing range is a
3757                                          # no-op
3758
3759             # If the new range doesn't logically extend the current final one
3760             # in the range list, create a new range at the end of the range
3761             # list.  (max cleverly is initialized to a negative number not
3762             # adjacent to 0 if the range list is empty, so even adding a range
3763             # to an empty range list starting at 0 will have this 'if'
3764             # succeed.)
3765             if ($start > $max + 1        # non-adjacent means can't extend.
3766                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3767                 || @{$r}[-1]->type != $type # types differ, can't extend.
3768             ) {
3769                 push @$r, Range->new($start, $end,
3770                                      Value => $value,
3771                                      Type => $type);
3772             }
3773             else {
3774
3775                 # Here, the new range starts just after the current highest in
3776                 # the range list, and they have the same type and value.
3777                 # Extend the current range to incorporate the new one.
3778                 @{$r}[-1]->set_end($end);
3779             }
3780
3781             # This becomes the new maximum.
3782             $max{$addr} = $end;
3783
3784             return;
3785         }
3786         #local $to_trace = 0 if main::DEBUG;
3787
3788         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3789
3790         # Here, the input range isn't after the whole rest of the range list.
3791         # Most likely 'splice' will be needed.  The rest of the routine finds
3792         # the needed splice parameters, and if necessary, does the splice.
3793         # First, find the offset parameter needed by the splice function for
3794         # the input range.  Note that the input range may span multiple
3795         # existing ones, but we'll worry about that later.  For now, just find
3796         # the beginning.  If the input range is to be inserted starting in a
3797         # position not currently in the range list, it must (obviously) come
3798         # just after the range below it, and just before the range above it.
3799         # Slightly less obviously, it will occupy the position currently
3800         # occupied by the range that is to come after it.  More formally, we
3801         # are looking for the position, $i, in the array of ranges, such that:
3802         #
3803         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3804         #
3805         # (The ordered relationships within existing ranges are also shown in
3806         # the equation above).  However, if the start of the input range is
3807         # within an existing range, the splice offset should point to that
3808         # existing range's position in the list; that is $i satisfies a
3809         # somewhat different equation, namely:
3810         #
3811         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3812         #
3813         # More briefly, $start can come before or after r[$i]->start, and at
3814         # this point, we don't know which it will be.  However, these
3815         # two equations share these constraints:
3816         #
3817         #   r[$i-1]->end < $start <= r[$i]->end
3818         #
3819         # And that is good enough to find $i.
3820
3821         my $i = $self->_search_ranges($start);
3822         if (! defined $i) {
3823             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3824             return;
3825         }
3826
3827         # The search function returns $i such that:
3828         #
3829         # r[$i-1]->end < $start <= r[$i]->end
3830         #
3831         # That means that $i points to the first range in the range list
3832         # that could possibly be affected by this operation.  We still don't
3833         # know if the start of the input range is within r[$i], or if it
3834         # points to empty space between r[$i-1] and r[$i].
3835         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3836
3837         # Special case the insertion of data that is not to replace any
3838         # existing data.
3839         if ($replace == $NO) {  # If $NO, has to be operation '+'
3840             #local $to_trace = 1 if main::DEBUG;
3841             trace "Doesn't replace" if main::DEBUG && $to_trace;
3842
3843             # Here, the new range is to take effect only on those code points
3844             # that aren't already in an existing range.  This can be done by
3845             # looking through the existing range list and finding the gaps in
3846             # the ranges that this new range affects, and then calling this
3847             # function recursively on each of those gaps, leaving untouched
3848             # anything already in the list.  Gather up a list of the changed
3849             # gaps first so that changes to the internal state as new ranges
3850             # are added won't be a problem.
3851             my @gap_list;
3852
3853             # First, if the starting point of the input range is outside an
3854             # existing one, there is a gap from there to the beginning of the
3855             # existing range -- add a span to fill the part that this new
3856             # range occupies
3857             if ($start < $r->[$i]->start) {
3858                 push @gap_list, Range->new($start,
3859                                            main::min($end,
3860                                                      $r->[$i]->start - 1),
3861                                            Type => $type);
3862                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3863             }
3864
3865             # Then look through the range list for other gaps until we reach
3866             # the highest range affected by the input one.
3867             my $j;
3868             for ($j = $i+1; $j < $range_list_size; $j++) {
3869                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3870                 last if $end < $r->[$j]->start;
3871
3872                 # If there is a gap between when this range starts and the
3873                 # previous one ends, add a span to fill it.  Note that just
3874                 # because there are two ranges doesn't mean there is a
3875                 # non-zero gap between them.  It could be that they have
3876                 # different values or types
3877                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3878                     push @gap_list,
3879                         Range->new($r->[$j-1]->end + 1,
3880                                    $r->[$j]->start - 1,
3881                                    Type => $type);
3882                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3883                 }
3884             }
3885
3886             # Here, we have either found an existing range in the range list,
3887             # beyond the area affected by the input one, or we fell off the
3888             # end of the loop because the input range affects the whole rest
3889             # of the range list.  In either case, $j is 1 higher than the
3890             # highest affected range.  If $j == $i, it means that there are no
3891             # affected ranges, that the entire insertion is in the gap between
3892             # r[$i-1], and r[$i], which we already have taken care of before
3893             # the loop.
3894             # On the other hand, if there are affected ranges, it might be
3895             # that there is a gap that needs filling after the final such
3896             # range to the end of the input range
3897             if ($r->[$j-1]->end < $end) {
3898                     push @gap_list, Range->new(main::max($start,
3899                                                          $r->[$j-1]->end + 1),
3900                                                $end,
3901                                                Type => $type);
3902                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3903             }
3904
3905             # Call recursively to fill in all the gaps.
3906             foreach my $gap (@gap_list) {
3907                 $self->_add_delete($operation,
3908                                    $gap->start,
3909                                    $gap->end,
3910                                    $value,
3911                                    Type => $type);
3912             }
3913
3914             return;
3915         }
3916
3917         # Here, we have taken care of the case where $replace is $NO.
3918         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3919         # If inserting a multiple record, this is where it goes, before the
3920         # first (if any) existing one if inserting LIFO.  (If this is to go
3921         # afterwards, FIFO, we below move the pointer to there.)  These imply
3922         # an insertion, and no change to any existing ranges.  Note that $i
3923         # can be -1 if this new range doesn't actually duplicate any existing,
3924         # and comes at the beginning of the list.
3925         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3926
3927             if ($start != $end) {
3928                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3929                 return;
3930             }
3931
3932             # If the new code point is within a current range ...
3933             if ($end >= $r->[$i]->start) {
3934
3935                 # Don't add an exact duplicate, as it isn't really a multiple
3936                 my $existing_value = $r->[$i]->value;
3937                 my $existing_type = $r->[$i]->type;
3938                 return if $value eq $existing_value && $type eq $existing_type;
3939
3940                 # If the multiple value is part of an existing range, we want
3941                 # to split up that range, so that only the single code point
3942                 # is affected.  To do this, we first call ourselves
3943                 # recursively to delete that code point from the table, having
3944                 # preserved its current data above.  Then we call ourselves
3945                 # recursively again to add the new multiple, which we know by
3946                 # the test just above is different than the current code
3947                 # point's value, so it will become a range containing a single
3948                 # code point: just itself.  Finally, we add back in the
3949                 # pre-existing code point, which will again be a single code
3950                 # point range.  Because 'i' likely will have changed as a
3951                 # result of these operations, we can't just continue on, but
3952                 # do this operation recursively as well.  If we are inserting
3953                 # LIFO, the pre-existing code point needs to go after the new
3954                 # one, so use MULTIPLE_AFTER; and vice versa.
3955                 if ($r->[$i]->start != $r->[$i]->end) {
3956                     $self->_add_delete('-', $start, $end, "");
3957                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3958                     return $self->_add_delete('+',
3959                             $start, $end,
3960                             $existing_value,
3961                             Type => $existing_type,
3962                             Replace => ($replace == $MULTIPLE_BEFORE)
3963                                        ? $MULTIPLE_AFTER
3964                                        : $MULTIPLE_BEFORE);
3965                 }
3966             }
3967
3968             # If to place this new record after, move to beyond all existing
3969             # ones; but don't add this one if identical to any of them, as it
3970             # isn't really a multiple.  This leaves the original order, so
3971             # that the current request is ignored.  The reasoning is that the
3972             # previous request that wanted this record to have high priority
3973             # should have precedence.
3974             if ($replace == $MULTIPLE_AFTER) {
3975                 while ($i < @$r && $r->[$i]->start == $start) {
3976                     return if $value eq $r->[$i]->value
3977                               && $type eq $r->[$i]->type;
3978                     $i++;
3979                 }
3980             }
3981             else {
3982                 # If instead we are to place this new record before any
3983                 # existing ones, remove any identical ones that come after it.
3984                 # This changes the existing order so that the new one is
3985                 # first, as is being requested.
3986                 for (my $j = $i + 1;
3987                      $j < @$r && $r->[$j]->start == $start;
3988                      $j++)
3989                 {
3990                     if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3991                         splice @$r, $j, 1;
3992                         last;   # There should only be one instance, so no
3993                                 # need to keep looking
3994                     }
3995                 }
3996             }
3997
3998             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3999             my @return = splice @$r,
4000                                 $i,
4001                                 0,
4002                                 Range->new($start,
4003                                            $end,
4004                                            Value => $value,
4005                                            Type => $type);
4006             if (main::DEBUG && $to_trace) {
4007                 trace "After splice:";
4008                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4009                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4010                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4011                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4012                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4013                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4014             }
4015             return @return;
4016         }
4017
4018         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4019         # leaves delete, insert, and replace either unconditionally or if not
4020         # equivalent.  $i still points to the first potential affected range.
4021         # Now find the highest range affected, which will determine the length
4022         # parameter to splice.  (The input range can span multiple existing
4023         # ones.)  If this isn't a deletion, while we are looking through the
4024         # range list, see also if this is a replacement rather than a clean
4025         # insertion; that is if it will change the values of at least one
4026         # existing range.  Start off assuming it is an insert, until find it
4027         # isn't.
4028         my $clean_insert = $operation eq '+';
4029         my $j;        # This will point to the highest affected range
4030
4031         # For non-zero types, the standard form is the value itself;
4032         my $standard_form = ($type) ? $value : main::standardize($value);
4033
4034         for ($j = $i; $j < $range_list_size; $j++) {
4035             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4036
4037             # If find a range that it doesn't overlap into, we can stop
4038             # searching
4039             last if $end < $r->[$j]->start;
4040
4041             # Here, overlaps the range at $j.  If the values don't match,
4042             # and so far we think this is a clean insertion, it becomes a
4043             # non-clean insertion, i.e., a 'change' or 'replace' instead.
4044             if ($clean_insert) {
4045                 if ($r->[$j]->standard_form ne $standard_form) {
4046                     $clean_insert = 0;
4047                     if ($replace == $CROAK) {
4048                         main::croak("The range to add "
4049                         . sprintf("%04X", $start)
4050                         . '-'
4051                         . sprintf("%04X", $end)
4052                         . " with value '$value' overlaps an existing range $r->[$j]");
4053                     }
4054                 }
4055                 else {
4056
4057                     # Here, the two values are essentially the same.  If the
4058                     # two are actually identical, replacing wouldn't change
4059                     # anything so skip it.
4060                     my $pre_existing = $r->[$j]->value;
4061                     if ($pre_existing ne $value) {
4062
4063                         # Here the new and old standardized values are the
4064                         # same, but the non-standardized values aren't.  If
4065                         # replacing unconditionally, then replace
4066                         if( $replace == $UNCONDITIONALLY) {
4067                             $clean_insert = 0;
4068                         }
4069                         else {
4070
4071                             # Here, are replacing conditionally.  Decide to
4072                             # replace or not based on which appears to look
4073                             # the "nicest".  If one is mixed case and the
4074                             # other isn't, choose the mixed case one.
4075                             my $new_mixed = $value =~ /[A-Z]/
4076                                             && $value =~ /[a-z]/;
4077                             my $old_mixed = $pre_existing =~ /[A-Z]/
4078                                             && $pre_existing =~ /[a-z]/;
4079
4080                             if ($old_mixed != $new_mixed) {
4081                                 $clean_insert = 0 if $new_mixed;
4082                                 if (main::DEBUG && $to_trace) {
4083                                     if ($clean_insert) {
4084                                         trace "Retaining $pre_existing over $value";
4085                                     }
4086                                     else {
4087                                         trace "Replacing $pre_existing with $value";
4088                                     }
4089                                 }
4090                             }
4091                             else {
4092
4093                                 # Here casing wasn't different between the two.
4094                                 # If one has hyphens or underscores and the
4095                                 # other doesn't, choose the one with the
4096                                 # punctuation.
4097                                 my $new_punct = $value =~ /[-_]/;
4098                                 my $old_punct = $pre_existing =~ /[-_]/;
4099
4100                                 if ($old_punct != $new_punct) {
4101                                     $clean_insert = 0 if $new_punct;
4102                                     if (main::DEBUG && $to_trace) {
4103                                         if ($clean_insert) {
4104                                             trace "Retaining $pre_existing over $value";
4105                                         }
4106                                         else {
4107                                             trace "Replacing $pre_existing with $value";
4108                                         }
4109                                     }
4110                                 }   # else existing one is just as "good";
4111                                     # retain it to save cycles.
4112                             }
4113                         }
4114                     }
4115                 }
4116             }
4117         } # End of loop looking for highest affected range.
4118
4119         # Here, $j points to one beyond the highest range that this insertion
4120         # affects (hence to beyond the range list if that range is the final
4121         # one in the range list).
4122
4123         # The splice length is all the affected ranges.  Get it before
4124         # subtracting, for efficiency, so we don't have to later add 1.
4125         my $length = $j - $i;
4126
4127         $j--;        # $j now points to the highest affected range.
4128         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4129
4130         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4131         # $j points to the highest affected range.  But it can be < $i or even
4132         # -1.  These happen only if the insertion is entirely in the gap
4133         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4134         # above exited first time through with $end < $r->[$i]->start.  (And
4135         # then we subtracted one from j)  This implies also that $start <
4136         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4137         # $start, so the entire input range is in the gap.
4138         if ($j < $i) {
4139
4140             # Here the entire input range is in the gap before $i.
4141
4142             if (main::DEBUG && $to_trace) {
4143                 if ($i) {
4144                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
4145                 }
4146                 else {
4147                     trace "Entire range is before $r->[$i]";
4148                 }
4149             }
4150             return if $operation ne '+'; # Deletion of a non-existent range is
4151                                          # a no-op
4152         }
4153         else {
4154
4155             # Here part of the input range is not in the gap before $i.  Thus,
4156             # there is at least one affected one, and $j points to the highest
4157             # such one.
4158
4159             # At this point, here is the situation:
4160             # This is not an insertion of a multiple, nor of tentative ($NO)
4161             # data.
4162             #   $i  points to the first element in the current range list that
4163             #            may be affected by this operation.  In fact, we know
4164             #            that the range at $i is affected because we are in
4165             #            the else branch of this 'if'
4166             #   $j  points to the highest affected range.
4167             # In other words,
4168             #   r[$i-1]->end < $start <= r[$i]->end
4169             # And:
4170             #   r[$i-1]->end < $start <= $end <= r[$j]->end
4171             #
4172             # Also:
4173             #   $clean_insert is a boolean which is set true if and only if
4174             #        this is a "clean insertion", i.e., not a change nor a
4175             #        deletion (multiple was handled above).
4176
4177             # We now have enough information to decide if this call is a no-op
4178             # or not.  It is a no-op if this is an insertion of already
4179             # existing data.
4180
4181             if (main::DEBUG && $to_trace && $clean_insert
4182                                          && $i == $j
4183                                          && $start >= $r->[$i]->start)
4184             {
4185                     trace "no-op";
4186             }
4187             return if $clean_insert
4188                       && $i == $j # more than one affected range => not no-op
4189
4190                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4191                       # Further, $start and/or $end is >= r[$i]->start
4192                       # The test below hence guarantees that
4193                       #     r[$i]->start < $start <= $end <= r[$i]->end
4194                       # This means the input range is contained entirely in
4195                       # the one at $i, so is a no-op
4196                       && $start >= $r->[$i]->start;
4197         }
4198
4199         # Here, we know that some action will have to be taken.  We have
4200         # calculated the offset and length (though adjustments may be needed)
4201         # for the splice.  Now start constructing the replacement list.
4202         my @replacement;
4203         my $splice_start = $i;
4204
4205         my $extends_below;
4206         my $extends_above;
4207
4208         # See if should extend any adjacent ranges.
4209         if ($operation eq '-') { # Don't extend deletions
4210             $extends_below = $extends_above = 0;
4211         }
4212         else {  # Here, should extend any adjacent ranges.  See if there are
4213                 # any.
4214             $extends_below = ($i > 0
4215                             # can't extend unless adjacent
4216                             && $r->[$i-1]->end == $start -1
4217                             # can't extend unless are same standard value
4218                             && $r->[$i-1]->standard_form eq $standard_form
4219                             # can't extend unless share type
4220                             && $r->[$i-1]->type == $type);
4221             $extends_above = ($j+1 < $range_list_size
4222                             && $r->[$j+1]->start == $end +1
4223                             && $r->[$j+1]->standard_form eq $standard_form
4224                             && $r->[$j+1]->type == $type);
4225         }
4226         if ($extends_below && $extends_above) { # Adds to both
4227             $splice_start--;     # start replace at element below
4228             $length += 2;        # will replace on both sides
4229             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4230
4231             # The result will fill in any gap, replacing both sides, and
4232             # create one large range.
4233             @replacement = Range->new($r->[$i-1]->start,
4234                                       $r->[$j+1]->end,
4235                                       Value => $value,
4236                                       Type => $type);
4237         }
4238         else {
4239
4240             # Here we know that the result won't just be the conglomeration of
4241             # a new range with both its adjacent neighbors.  But it could
4242             # extend one of them.
4243
4244             if ($extends_below) {
4245
4246                 # Here the new element adds to the one below, but not to the
4247                 # one above.  If inserting, and only to that one range,  can
4248                 # just change its ending to include the new one.
4249                 if ($length == 0 && $clean_insert) {
4250                     $r->[$i-1]->set_end($end);
4251                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4252                     return;
4253                 }
4254                 else {
4255                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4256                     $splice_start--;        # start replace at element below
4257                     $length++;              # will replace the element below
4258                     $start = $r->[$i-1]->start;
4259                 }
4260             }
4261             elsif ($extends_above) {
4262
4263                 # Here the new element adds to the one above, but not below.
4264                 # Mirror the code above
4265                 if ($length == 0 && $clean_insert) {
4266                     $r->[$j+1]->set_start($start);
4267                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4268                     return;
4269                 }
4270                 else {
4271                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4272                     $length++;        # will replace the element above
4273                     $end = $r->[$j+1]->end;
4274                 }
4275             }
4276
4277             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4278
4279             # Finally, here we know there will have to be a splice.
4280             # If the change or delete affects only the highest portion of the
4281             # first affected range, the range will have to be split.  The
4282             # splice will remove the whole range, but will replace it by a new
4283             # range containing just the unaffected part.  So, in this case,
4284             # add to the replacement list just this unaffected portion.
4285             if (! $extends_below
4286                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4287             {
4288                 push @replacement,
4289                     Range->new($r->[$i]->start,
4290                                $start - 1,
4291                                Value => $r->[$i]->value,
4292                                Type => $r->[$i]->type);
4293             }
4294
4295             # In the case of an insert or change, but not a delete, we have to
4296             # put in the new stuff;  this comes next.
4297             if ($operation eq '+') {
4298                 push @replacement, Range->new($start,
4299                                               $end,
4300                                               Value => $value,
4301                                               Type => $type);
4302             }
4303
4304             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4305             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4306
4307             # And finally, if we're changing or deleting only a portion of the
4308             # highest affected range, it must be split, as the lowest one was.
4309             if (! $extends_above
4310                 && $j >= 0  # Remember that j can be -1 if before first
4311                             # current element
4312                 && $end >= $r->[$j]->start
4313                 && $end < $r->[$j]->end)
4314             {
4315                 push @replacement,
4316                     Range->new($end + 1,
4317                                $r->[$j]->end,
4318                                Value => $r->[$j]->value,
4319                                Type => $r->[$j]->type);
4320             }
4321         }
4322
4323         # And do the splice, as calculated above
4324         if (main::DEBUG && $to_trace) {
4325             trace "replacing $length element(s) at $i with ";
4326             foreach my $replacement (@replacement) {
4327                 trace "    $replacement";
4328             }
4329             trace "Before splice:";
4330             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4331             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4332             trace "i  =[", $i, "]", $r->[$i];
4333             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4334             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4335         }
4336
4337         my @return = splice @$r, $splice_start, $length, @replacement;
4338
4339         if (main::DEBUG && $to_trace) {
4340             trace "After splice:";
4341             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4342             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4343             trace "i  =[", $i, "]", $r->[$i];
4344             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4345             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4346             trace "removed ", @return if @return;
4347         }
4348
4349         # An actual deletion could have changed the maximum in the list.
4350         # There was no deletion if the splice didn't return something, but
4351         # otherwise recalculate it.  This is done too rarely to worry about
4352         # performance.
4353         if ($operation eq '-' && @return) {
4354             if (@$r) {
4355                 $max{$addr} = $r->[-1]->end;
4356             }
4357             else {  # Now empty
4358                 $max{$addr} = $max_init;
4359             }
4360         }
4361         return @return;
4362     }
4363
4364     sub reset_each_range {  # reset the iterator for each_range();
4365         my $self = shift;
4366         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4367
4368         no overloading;
4369         undef $each_range_iterator{pack 'J', $self};
4370         return;
4371     }
4372
4373     sub each_range {
4374         # Iterate over each range in a range list.  Results are undefined if
4375         # the range list is changed during the iteration.
4376
4377         my $self = shift;
4378         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4379
4380         my $addr = do { no overloading; pack 'J', $self; };
4381
4382         return if $self->is_empty;
4383
4384         $each_range_iterator{$addr} = -1
4385                                 if ! defined $each_range_iterator{$addr};
4386         $each_range_iterator{$addr}++;
4387         return $ranges{$addr}->[$each_range_iterator{$addr}]
4388                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4389         undef $each_range_iterator{$addr};
4390         return;
4391     }
4392
4393     sub count {        # Returns count of code points in range list
4394         my $self = shift;
4395         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4396
4397         my $addr = do { no overloading; pack 'J', $self; };
4398
4399         my $count = 0;
4400         foreach my $range (@{$ranges{$addr}}) {
4401             $count += $range->end - $range->start + 1;
4402         }
4403         return $count;
4404     }
4405
4406     sub delete_range {    # Delete a range
4407         my $self = shift;
4408         my $start = shift;
4409         my $end = shift;
4410
4411         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4412
4413         return $self->_add_delete('-', $start, $end, "");
4414     }
4415
4416     sub is_empty { # Returns boolean as to if a range list is empty
4417         my $self = shift;
4418         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4419
4420         no overloading;
4421         return scalar @{$ranges{pack 'J', $self}} == 0;
4422     }
4423
4424     sub hash {
4425         # Quickly returns a scalar suitable for separating tables into
4426         # buckets, i.e. it is a hash function of the contents of a table, so
4427         # there are relatively few conflicts.
4428
4429         my $self = shift;
4430         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4431
4432         my $addr = do { no overloading; pack 'J', $self; };
4433
4434         # These are quickly computable.  Return looks like 'min..max;count'
4435         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4436     }
4437 } # End closure for _Range_List_Base
4438
4439 package Range_List;
4440 use parent '-norequire', '_Range_List_Base';
4441
4442 # A Range_List is a range list for match tables; i.e. the range values are
4443 # not significant.  Thus a number of operations can be safely added to it,
4444 # such as inversion, intersection.  Note that union is also an unsafe
4445 # operation when range values are cared about, and that method is in the base
4446 # class, not here.  But things are set up so that that method is callable only
4447 # during initialization.  Only in this derived class, is there an operation
4448 # that combines two tables.  A Range_Map can thus be used to initialize a
4449 # Range_List, and its mappings will be in the list, but are not significant to
4450 # this class.
4451
4452 sub trace { return main::trace(@_); }
4453
4454 { # Closure
4455
4456     use overload
4457         fallback => 0,
4458         '+' => sub { my $self = shift;
4459                     my $other = shift;
4460
4461                     return $self->_union($other)
4462                 },
4463         '+=' => sub { my $self = shift;
4464                     my $other = shift;
4465                     my $reversed = shift;
4466
4467                     if ($reversed) {
4468                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4469                         . ref($other)
4470                         . ' += '
4471                         . ref($self)
4472                         . "'.  undef returned.");
4473                         return;
4474                     }
4475
4476                     return $self->_union($other)
4477                 },
4478         '&' => sub { my $self = shift;
4479                     my $other = shift;
4480
4481                     return $self->_intersect($other, 0);
4482                 },
4483         '&=' => sub { my $self = shift;
4484                     my $other = shift;
4485                     my $reversed = shift;
4486
4487                     if ($reversed) {
4488                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4489                         . ref($other)
4490                         . ' &= '
4491                         . ref($self)
4492                         . "'.  undef returned.");
4493                         return;
4494                     }
4495
4496                     return $self->_intersect($other, 0);
4497                 },
4498         '~' => "_invert",
4499         '-' => "_subtract",
4500     ;
4501
4502     sub _invert {
4503         # Returns a new Range_List that gives all code points not in $self.
4504
4505         my $self = shift;
4506
4507         my $new = Range_List->new;
4508
4509         # Go through each range in the table, finding the gaps between them
4510         my $max = -1;   # Set so no gap before range beginning at 0
4511         for my $range ($self->ranges) {
4512             my $start = $range->start;
4513             my $end   = $range->end;
4514
4515             # If there is a gap before this range, the inverse will contain
4516             # that gap.
4517             if ($start > $max + 1) {
4518                 $new->add_range($max + 1, $start - 1);
4519             }
4520             $max = $end;
4521         }
4522
4523         # And finally, add the gap from the end of the table to the max
4524         # possible code point
4525         if ($max < $MAX_UNICODE_CODEPOINT) {
4526             $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4527         }
4528         return $new;
4529     }
4530
4531     sub _subtract {
4532         # Returns a new Range_List with the argument deleted from it.  The
4533         # argument can be a single code point, a range, or something that has
4534         # a range, with the _range_list() method on it returning them
4535
4536         my $self = shift;
4537         my $other = shift;
4538         my $reversed = shift;
4539         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4540
4541         if ($reversed) {
4542             Carp::my_carp_bug("Bad news.  Can't cope with '"
4543             . ref($other)
4544             . ' - '
4545             . ref($self)
4546             . "'.  undef returned.");
4547             return;
4548         }
4549
4550         my $new = Range_List->new(Initialize => $self);
4551
4552         if (! ref $other) { # Single code point
4553             $new->delete_range($other, $other);
4554         }
4555         elsif ($other->isa('Range')) {
4556             $new->delete_range($other->start, $other->end);
4557         }
4558         elsif ($other->can('_range_list')) {
4559             foreach my $range ($other->_range_list->ranges) {
4560                 $new->delete_range($range->start, $range->end);
4561             }
4562         }
4563         else {
4564             Carp::my_carp_bug("Can't cope with a "
4565                         . ref($other)
4566                         . " argument to '-'.  Subtraction ignored."
4567                         );
4568             return $self;
4569         }
4570
4571         return $new;
4572     }
4573
4574     sub _intersect {
4575         # Returns either a boolean giving whether the two inputs' range lists
4576         # intersect (overlap), or a new Range_List containing the intersection
4577         # of the two lists.  The optional final parameter being true indicates
4578         # to do the check instead of the intersection.
4579
4580         my $a_object = shift;
4581         my $b_object = shift;
4582         my $check_if_overlapping = shift;
4583         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4584         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4585
4586         if (! defined $b_object) {
4587             my $message = "";
4588             $message .= $a_object->_owner_name_of if defined $a_object;
4589             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4590             return;
4591         }
4592
4593         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4594         # Thus the intersection could be much more simply be written:
4595         #   return ~(~$a_object + ~$b_object);
4596         # But, this is slower, and when taking the inverse of a large
4597         # range_size_1 table, back when such tables were always stored that
4598         # way, it became prohibitively slow, hence the code was changed to the
4599         # below
4600
4601         if ($b_object->isa('Range')) {
4602             $b_object = Range_List->new(Initialize => $b_object,
4603                                         Owner => $a_object->_owner_name_of);
4604         }
4605         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4606
4607         my @a_ranges = $a_object->ranges;
4608         my @b_ranges = $b_object->ranges;
4609
4610         #local $to_trace = 1 if main::DEBUG;
4611         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4612
4613         # Start with the first range in each list
4614         my $a_i = 0;
4615         my $range_a = $a_ranges[$a_i];
4616         my $b_i = 0;
4617         my $range_b = $b_ranges[$b_i];
4618
4619         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4620                                                 if ! $check_if_overlapping;
4621
4622         # If either list is empty, there is no intersection and no overlap
4623         if (! defined $range_a || ! defined $range_b) {
4624             return $check_if_overlapping ? 0 : $new;
4625         }
4626         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4627
4628         # Otherwise, must calculate the intersection/overlap.  Start with the
4629         # very first code point in each list
4630         my $a = $range_a->start;
4631         my $b = $range_b->start;
4632
4633         # Loop through all the ranges of each list; in each iteration, $a and
4634         # $b are the current code points in their respective lists
4635         while (1) {
4636
4637             # If $a and $b are the same code point, ...
4638             if ($a == $b) {
4639
4640                 # it means the lists overlap.  If just checking for overlap
4641                 # know the answer now,
4642                 return 1 if $check_if_overlapping;
4643
4644                 # The intersection includes this code point plus anything else
4645                 # common to both current ranges.
4646                 my $start = $a;
4647                 my $end = main::min($range_a->end, $range_b->end);
4648                 if (! $check_if_overlapping) {
4649                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4650                     $new->add_range($start, $end);
4651                 }
4652
4653                 # Skip ahead to the end of the current intersect
4654                 $a = $b = $end;
4655
4656                 # If the current intersect ends at the end of either range (as
4657                 # it must for at least one of them), the next possible one
4658                 # will be the beginning code point in it's list's next range.
4659                 if ($a == $range_a->end) {
4660                     $range_a = $a_ranges[++$a_i];
4661                     last unless defined $range_a;
4662                     $a = $range_a->start;
4663                 }
4664                 if ($b == $range_b->end) {
4665                     $range_b = $b_ranges[++$b_i];
4666                     last unless defined $range_b;
4667                     $b = $range_b->start;
4668                 }
4669
4670                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4671             }
4672             elsif ($a < $b) {
4673
4674                 # Not equal, but if the range containing $a encompasses $b,
4675                 # change $a to be the middle of the range where it does equal
4676                 # $b, so the next iteration will get the intersection
4677                 if ($range_a->end >= $b) {
4678                     $a = $b;
4679                 }
4680                 else {
4681
4682                     # Here, the current range containing $a is entirely below
4683                     # $b.  Go try to find a range that could contain $b.
4684                     $a_i = $a_object->_search_ranges($b);
4685
4686                     # If no range found, quit.
4687                     last unless defined $a_i;
4688
4689                     # The search returns $a_i, such that
4690                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4691                     # Set $a to the beginning of this new range, and repeat.
4692                     $range_a = $a_ranges[$a_i];
4693                     $a = $range_a->start;
4694                 }
4695             }
4696             else { # Here, $b < $a.
4697
4698                 # Mirror image code to the leg just above
4699                 if ($range_b->end >= $a) {
4700                     $b = $a;
4701                 }
4702                 else {
4703                     $b_i = $b_object->_search_ranges($a);
4704                     last unless defined $b_i;
4705                     $range_b = $b_ranges[$b_i];
4706                     $b = $range_b->start;
4707                 }
4708             }
4709         } # End of looping through ranges.
4710
4711         # Intersection fully computed, or now know that there is no overlap
4712         return $check_if_overlapping ? 0 : $new;
4713     }
4714
4715     sub overlaps {
4716         # Returns boolean giving whether the two arguments overlap somewhere
4717
4718         my $self = shift;
4719         my $other = shift;
4720         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4721
4722         return $self->_intersect($other, 1);
4723     }
4724
4725     sub add_range {
4726         # Add a range to the list.
4727
4728         my $self = shift;
4729         my $start = shift;
4730         my $end = shift;
4731         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4732
4733         return $self->_add_delete('+', $start, $end, "");
4734     }
4735
4736     sub matches_identically_to {
4737         # Return a boolean as to whether or not two Range_Lists match identical
4738         # sets of code points.
4739
4740         my $self = shift;
4741         my $other = shift;
4742         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4743
4744         # These are ordered in increasing real time to figure out (at least
4745         # until a patch changes that and doesn't change this)
4746         return 0 if $self->max != $other->max;
4747         return 0 if $self->min != $other->min;
4748         return 0 if $self->range_count != $other->range_count;
4749         return 0 if $self->count != $other->count;
4750
4751         # Here they could be identical because all the tests above passed.
4752         # The loop below is somewhat simpler since we know they have the same
4753         # number of elements.  Compare range by range, until reach the end or
4754         # find something that differs.
4755         my @a_ranges = $self->ranges;
4756         my @b_ranges = $other->ranges;
4757         for my $i (0 .. @a_ranges - 1) {
4758             my $a = $a_ranges[$i];
4759             my $b = $b_ranges[$i];
4760             trace "self $a; other $b" if main::DEBUG && $to_trace;
4761             return 0 if ! defined $b
4762                         || $a->start != $b->start
4763                         || $a->end != $b->end;
4764         }
4765         return 1;
4766     }
4767
4768     sub is_code_point_usable {
4769         # This used only for making the test script.  See if the input
4770         # proposed trial code point is one that Perl will handle.  If second
4771         # parameter is 0, it won't select some code points for various
4772         # reasons, noted below.
4773
4774         my $code = shift;
4775         my $try_hard = shift;
4776         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4777
4778         return 0 if $code < 0;                # Never use a negative
4779
4780         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4781         # the character very frequently used.
4782         return $try_hard if $code == 0x0000;
4783
4784         # shun non-character code points.
4785         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4786         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4787
4788         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
4789         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4790
4791         return 1;
4792     }
4793
4794     sub get_valid_code_point {
4795         # Return a code point that's part of the range list.  Returns nothing
4796         # if the table is empty or we can't find a suitable code point.  This
4797         # used only for making the test script.
4798
4799         my $self = shift;
4800         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4801
4802         my $addr = do { no overloading; pack 'J', $self; };
4803
4804         # On first pass, don't choose less desirable code points; if no good
4805         # one is found, repeat, allowing a less desirable one to be selected.
4806         for my $try_hard (0, 1) {
4807
4808             # Look through all the ranges for a usable code point.
4809             for my $set (reverse $self->ranges) {
4810
4811                 # Try the edge cases first, starting with the end point of the
4812                 # range.
4813                 my $end = $set->end;
4814                 return $end if is_code_point_usable($end, $try_hard);
4815
4816                 # End point didn't, work.  Start at the beginning and try
4817                 # every one until find one that does work.
4818                 for my $trial ($set->start .. $end - 1) {
4819                     return $trial if is_code_point_usable($trial, $try_hard);
4820                 }
4821             }
4822         }
4823         return ();  # If none found, give up.
4824     }
4825
4826     sub get_invalid_code_point {
4827         # Return a code point that's not part of the table.  Returns nothing
4828         # if the table covers all code points or a suitable code point can't
4829         # be found.  This used only for making the test script.
4830
4831         my $self = shift;
4832         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4833
4834         # Just find a valid code point of the inverse, if any.
4835         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4836     }
4837 } # end closure for Range_List
4838
4839 package Range_Map;
4840 use parent '-norequire', '_Range_List_Base';
4841
4842 # A Range_Map is a range list in which the range values (called maps) are
4843 # significant, and hence shouldn't be manipulated by our other code, which
4844 # could be ambiguous or lose things.  For example, in taking the union of two
4845 # lists, which share code points, but which have differing values, which one
4846 # has precedence in the union?
4847 # It turns out that these operations aren't really necessary for map tables,
4848 # and so this class was created to make sure they aren't accidentally
4849 # applied to them.
4850
4851 { # Closure
4852
4853     sub add_map {
4854         # Add a range containing a mapping value to the list
4855
4856         my $self = shift;
4857         # Rest of parameters passed on
4858
4859         return $self->_add_delete('+', @_);
4860     }
4861
4862     sub add_duplicate {
4863         # Adds entry to a range list which can duplicate an existing entry
4864
4865         my $self = shift;
4866         my $code_point = shift;
4867         my $value = shift;
4868         my %args = @_;
4869         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4870         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4871
4872         return $self->add_map($code_point, $code_point,
4873                                 $value, Replace => $replace);
4874     }
4875 } # End of closure for package Range_Map
4876
4877 package _Base_Table;
4878
4879 # A table is the basic data structure that gets written out into a file for
4880 # use by the Perl core.  This is the abstract base class implementing the
4881 # common elements from the derived ones.  A list of the methods to be
4882 # furnished by an implementing class is just after the constructor.
4883
4884 sub standardize { return main::standardize($_[0]); }
4885 sub trace { return main::trace(@_); }
4886
4887 { # Closure
4888
4889     main::setup_package();
4890
4891     my %range_list;
4892     # Object containing the ranges of the table.
4893     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4894
4895     my %full_name;
4896     # The full table name.
4897     main::set_access('full_name', \%full_name, 'r');
4898
4899     my %name;
4900     # The table name, almost always shorter
4901     main::set_access('name', \%name, 'r');
4902
4903     my %short_name;
4904     # The shortest of all the aliases for this table, with underscores removed
4905     main::set_access('short_name', \%short_name);
4906
4907     my %nominal_short_name_length;
4908     # The length of short_name before removing underscores
4909     main::set_access('nominal_short_name_length',
4910                     \%nominal_short_name_length);
4911
4912     my %complete_name;
4913     # The complete name, including property.
4914     main::set_access('complete_name', \%complete_name, 'r');
4915
4916     my %property;
4917     # Parent property this table is attached to.
4918     main::set_access('property', \%property, 'r');
4919
4920     my %aliases;
4921     # Ordered list of alias objects of the table's name.  The first ones in
4922     # the list are output first in comments
4923     main::set_access('aliases', \%aliases, 'readable_array');
4924
4925     my %comment;
4926     # A comment associated with the table for human readers of the files
4927     main::set_access('comment', \%comment, 's');
4928
4929     my %description;
4930     # A comment giving a short description of the table's meaning for human
4931     # readers of the files.
4932     main::set_access('description', \%description, 'readable_array');
4933
4934     my %note;
4935     # A comment giving a short note about the table for human readers of the
4936     # files.
4937     main::set_access('note', \%note, 'readable_array');
4938
4939     my %fate;
4940     # Enum; there are a number of possibilities for what happens to this
4941     # table: it could be normal, or suppressed, or not for external use.  See
4942     # values at definition for $SUPPRESSED.
4943     main::set_access('fate', \%fate, 'r');
4944
4945     my %find_table_from_alias;
4946     # The parent property passes this pointer to a hash which this class adds
4947     # all its aliases to, so that the parent can quickly take an alias and
4948     # find this table.
4949     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4950
4951     my %locked;
4952     # After this table is made equivalent to another one; we shouldn't go
4953     # changing the contents because that could mean it's no longer equivalent
4954     main::set_access('locked', \%locked, 'r');
4955
4956     my %file_path;
4957     # This gives the final path to the file containing the table.  Each
4958     # directory in the path is an element in the array
4959     main::set_access('file_path', \%file_path, 'readable_array');
4960
4961     my %status;
4962     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4963     main::set_access('status', \%status, 'r');
4964
4965     my %status_info;
4966     # A comment about its being obsolete, or whatever non normal status it has
4967     main::set_access('status_info', \%status_info, 'r');
4968
4969     my %caseless_equivalent;
4970     # The table this is equivalent to under /i matching, if any.
4971     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4972
4973     my %range_size_1;
4974     # Is the table to be output with each range only a single code point?
4975     # This is done to avoid breaking existing code that may have come to rely
4976     # on this behavior in previous versions of this program.)
4977     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4978
4979     my %perl_extension;
4980     # A boolean set iff this table is a Perl extension to the Unicode
4981     # standard.
4982     main::set_access('perl_extension', \%perl_extension, 'r');
4983
4984     my %output_range_counts;
4985     # A boolean set iff this table is to have comments written in the
4986     # output file that contain the number of code points in the range.
4987     # The constructor can override the global flag of the same name.
4988     main::set_access('output_range_counts', \%output_range_counts, 'r');
4989
4990     my %format;
4991     # The format of the entries of the table.  This is calculated from the
4992     # data in the table (or passed in the constructor).  This is an enum e.g.,
4993     # $STRING_FORMAT.  It is marked protected as it should not be generally
4994     # used to override calculations.
4995     main::set_access('format', \%format, 'r', 'p_s');
4996
4997     sub new {
4998         # All arguments are key => value pairs, which you can see below, most
4999         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5000         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5001         # documented in the Alias package
5002
5003         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5004
5005         my $class = shift;
5006
5007         my $self = bless \do { my $anonymous_scalar }, $class;
5008         my $addr = do { no overloading; pack 'J', $self; };
5009
5010         my %args = @_;
5011
5012         $name{$addr} = delete $args{'Name'};
5013         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5014         $full_name{$addr} = delete $args{'Full_Name'};
5015         my $complete_name = $complete_name{$addr}
5016                           = delete $args{'Complete_Name'};
5017         $format{$addr} = delete $args{'Format'};
5018         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5019         $property{$addr} = delete $args{'_Property'};
5020         $range_list{$addr} = delete $args{'_Range_List'};
5021         $status{$addr} = delete $args{'Status'} || $NORMAL;
5022         $status_info{$addr} = delete $args{'_Status_Info'} || "";
5023         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5024         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5025         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5026         my $ucd = delete $args{'UCD'};
5027
5028         my $description = delete $args{'Description'};
5029         my $ok_as_filename = delete $args{'OK_as_Filename'};
5030         my $loose_match = delete $args{'Fuzzy'};
5031         my $note = delete $args{'Note'};
5032         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5033         my $perl_extension = delete $args{'Perl_Extension'};
5034
5035         # Shouldn't have any left over
5036         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5037
5038         # Can't use || above because conceivably the name could be 0, and
5039         # can't use // operator in case this program gets used in Perl 5.8
5040         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5041         $output_range_counts{$addr} = $output_range_counts if
5042                                         ! defined $output_range_counts{$addr};
5043
5044         $aliases{$addr} = [ ];
5045         $comment{$addr} = [ ];
5046         $description{$addr} = [ ];
5047         $note{$addr} = [ ];
5048         $file_path{$addr} = [ ];
5049         $locked{$addr} = "";
5050
5051         push @{$description{$addr}}, $description if $description;
5052         push @{$note{$addr}}, $note if $note;
5053
5054         if ($fate{$addr} == $PLACEHOLDER) {
5055
5056             # A placeholder table doesn't get documented, is a perl extension,
5057             # and quite likely will be empty
5058             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5059             $perl_extension = 1 if ! defined $perl_extension;
5060             $ucd = 0 if ! defined $ucd;
5061             push @tables_that_may_be_empty, $complete_name{$addr};
5062             $self->add_comment(<<END);
5063 This is a placeholder because it is not in Version $string_version of Unicode,
5064 but is needed by the Perl core to work gracefully.  Because it is not in this
5065 version of Unicode, it will not be listed in $pod_file.pod
5066 END
5067         }
5068         elsif (exists $why_suppressed{$complete_name}
5069                 # Don't suppress if overridden
5070                 && ! grep { $_ eq $complete_name{$addr} }
5071                                                     @output_mapped_properties)
5072         {
5073             $fate{$addr} = $SUPPRESSED;
5074         }
5075         elsif ($fate{$addr} == $SUPPRESSED
5076                && ! exists $why_suppressed{$property{$addr}->complete_name})
5077         {
5078             Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
5079             # perhaps Fate => [ $SUPPRESSED, "reason" ]
5080         }
5081
5082         # If hasn't set its status already, see if it is on one of the
5083         # lists of properties or tables that have particular statuses; if
5084         # not, is normal.  The lists are prioritized so the most serious
5085         # ones are checked first
5086         if (! $status{$addr}) {
5087             if (exists $why_deprecated{$complete_name}) {
5088                 $status{$addr} = $DEPRECATED;
5089             }
5090             elsif (exists $why_stabilized{$complete_name}) {
5091                 $status{$addr} = $STABILIZED;
5092             }
5093             elsif (exists $why_obsolete{$complete_name}) {
5094                 $status{$addr} = $OBSOLETE;
5095             }
5096
5097             # Existence above doesn't necessarily mean there is a message
5098             # associated with it.  Use the most serious message.
5099             if ($status{$addr}) {
5100                 if ($why_deprecated{$complete_name}) {
5101                     $status_info{$addr}
5102                                 = $why_deprecated{$complete_name};
5103                 }
5104                 elsif ($why_stabilized{$complete_name}) {
5105                     $status_info{$addr}
5106                                 = $why_stabilized{$complete_name};
5107                 }
5108                 elsif ($why_obsolete{$complete_name}) {
5109                     $status_info{$addr}
5110                                 = $why_obsolete{$complete_name};
5111                 }
5112             }
5113         }
5114
5115         $perl_extension{$addr} = $perl_extension || 0;
5116
5117         # Don't list a property by default that is internal only
5118         if ($fate{$addr} > $MAP_PROXIED) {
5119             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5120             $ucd = 0 if ! defined $ucd;
5121         }
5122         else {
5123             $ucd = 1 if ! defined $ucd;
5124         }
5125
5126         # By convention what typically gets printed only or first is what's
5127         # first in the list, so put the full name there for good output
5128         # clarity.  Other routines rely on the full name being first on the
5129         # list
5130         $self->add_alias($full_name{$addr},
5131                             OK_as_Filename => $ok_as_filename,
5132                             Fuzzy => $loose_match,
5133                             Re_Pod_Entry => $make_re_pod_entry,
5134                             Status => $status{$addr},
5135                             UCD => $ucd,
5136                             );
5137
5138         # Then comes the other name, if meaningfully different.
5139         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5140             $self->add_alias($name{$addr},
5141                             OK_as_Filename => $ok_as_filename,
5142                             Fuzzy => $loose_match,
5143                             Re_Pod_Entry => $make_re_pod_entry,
5144                             Status => $status{$addr},
5145                             UCD => $ucd,
5146                             );
5147         }
5148
5149         return $self;
5150     }
5151
5152     # Here are the methods that are required to be defined by any derived
5153     # class
5154     for my $sub (qw(
5155                     handle_special_range
5156                     append_to_body
5157                     pre_body
5158                 ))
5159                 # write() knows how to write out normal ranges, but it calls
5160                 # handle_special_range() when it encounters a non-normal one.
5161                 # append_to_body() is called by it after it has handled all
5162                 # ranges to add anything after the main portion of the table.
5163                 # And finally, pre_body() is called after all this to build up
5164                 # anything that should appear before the main portion of the
5165                 # table.  Doing it this way allows things in the middle to
5166                 # affect what should appear before the main portion of the
5167                 # table.
5168     {
5169         no strict "refs";
5170         *$sub = sub {
5171             Carp::my_carp_bug( __LINE__
5172                               . ": Must create method '$sub()' for "
5173                               . ref shift);
5174             return;
5175         }
5176     }
5177
5178     use overload
5179         fallback => 0,
5180         "." => \&main::_operator_dot,
5181         ".=" => \&main::_operator_dot_equal,
5182         '!=' => \&main::_operator_not_equal,
5183         '==' => \&main::_operator_equal,
5184     ;
5185
5186     sub ranges {
5187         # Returns the array of ranges associated with this table.
5188
5189         no overloading;
5190         return $range_list{pack 'J', shift}->ranges;
5191     }
5192
5193     sub add_alias {
5194         # Add a synonym for this table.
5195
5196         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5197
5198         my $self = shift;
5199         my $name = shift;       # The name to add.
5200         my $pointer = shift;    # What the alias hash should point to.  For
5201                                 # map tables, this is the parent property;
5202                                 # for match tables, it is the table itself.
5203
5204         my %args = @_;
5205         my $loose_match = delete $args{'Fuzzy'};
5206
5207         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5208         $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5209
5210         my $ok_as_filename = delete $args{'OK_as_Filename'};
5211         $ok_as_filename = 1 unless defined $ok_as_filename;
5212
5213         my $status = delete $args{'Status'};
5214         $status = $NORMAL unless defined $status;
5215
5216         # An internal name does not get documented, unless overridden by the
5217         # input.
5218         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5219
5220         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5221
5222         # Capitalize the first letter of the alias unless it is one of the CJK
5223         # ones which specifically begins with a lower 'k'.  Do this because
5224         # Unicode has varied whether they capitalize first letters or not, and
5225         # have later changed their minds and capitalized them, but not the
5226         # other way around.  So do it always and avoid changes from release to
5227         # release
5228         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5229
5230         my $addr = do { no overloading; pack 'J', $self; };
5231
5232         # Figure out if should be loosely matched if not already specified.
5233         if (! defined $loose_match) {
5234
5235             # Is a loose_match if isn't null, and doesn't begin with an
5236             # underscore and isn't just a number
5237             if ($name ne ""
5238                 && substr($name, 0, 1) ne '_'
5239                 && $name !~ qr{^[0-9_.+-/]+$})
5240             {
5241                 $loose_match = 1;
5242             }
5243             else {
5244                 $loose_match = 0;
5245             }
5246         }
5247
5248         # If this alias has already been defined, do nothing.
5249         return if defined $find_table_from_alias{$addr}->{$name};
5250
5251         # That includes if it is standardly equivalent to an existing alias,
5252         # in which case, add this name to the list, so won't have to search
5253         # for it again.
5254         my $standard_name = main::standardize($name);
5255         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5256             $find_table_from_alias{$addr}->{$name}
5257                         = $find_table_from_alias{$addr}->{$standard_name};
5258             return;
5259         }
5260
5261         # Set the index hash for this alias for future quick reference.
5262         $find_table_from_alias{$addr}->{$name} = $pointer;
5263         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5264         local $to_trace = 0 if main::DEBUG;
5265         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5266         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5267
5268
5269         # Put the new alias at the end of the list of aliases unless the final
5270         # element begins with an underscore (meaning it is for internal perl
5271         # use) or is all numeric, in which case, put the new one before that
5272         # one.  This floats any all-numeric or underscore-beginning aliases to
5273         # the end.  This is done so that they are listed last in output lists,
5274         # to encourage the user to use a better name (either more descriptive
5275         # or not an internal-only one) instead.  This ordering is relied on
5276         # implicitly elsewhere in this program, like in short_name()
5277         my $list = $aliases{$addr};
5278         my $insert_position = (@$list == 0
5279                                 || (substr($list->[-1]->name, 0, 1) ne '_'
5280                                     && $list->[-1]->name =~ /\D/))
5281                             ? @$list
5282                             : @$list - 1;
5283         splice @$list,
5284                 $insert_position,
5285                 0,
5286                 Alias->new($name, $loose_match, $make_re_pod_entry,
5287                                                 $ok_as_filename, $status, $ucd);
5288
5289         # This name may be shorter than any existing ones, so clear the cache
5290         # of the shortest, so will have to be recalculated.
5291         no overloading;
5292         undef $short_name{pack 'J', $self};
5293         return;
5294     }
5295
5296     sub short_name {
5297         # Returns a name suitable for use as the base part of a file name.
5298         # That is, shorter wins.  It can return undef if there is no suitable
5299         # name.  The name has all non-essential underscores removed.
5300
5301         # The optional second parameter is a reference to a scalar in which
5302         # this routine will store the length the returned name had before the
5303         # underscores were removed, or undef if the return is undef.
5304
5305         # The shortest name can change if new aliases are added.  So using
5306         # this should be deferred until after all these are added.  The code
5307         # that does that should clear this one's cache.
5308         # Any name with alphabetics is preferred over an all numeric one, even
5309         # if longer.
5310
5311         my $self = shift;
5312         my $nominal_length_ptr = shift;
5313         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5314
5315         my $addr = do { no overloading; pack 'J', $self; };
5316
5317         # For efficiency, don't recalculate, but this means that adding new
5318         # aliases could change what the shortest is, so the code that does
5319         # that needs to undef this.
5320         if (defined $short_name{$addr}) {
5321             if ($nominal_length_ptr) {
5322                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5323             }
5324             return $short_name{$addr};
5325         }
5326
5327         # Look at each alias
5328         foreach my $alias ($self->aliases()) {
5329
5330             # Don't use an alias that isn't ok to use for an external name.
5331             next if ! $alias->ok_as_filename;
5332
5333             my $name = main::Standardize($alias->name);
5334             trace $self, $name if main::DEBUG && $to_trace;
5335
5336             # Take the first one, or a shorter one that isn't numeric.  This
5337             # relies on numeric aliases always being last in the array
5338             # returned by aliases().  Any alpha one will have precedence.
5339             if (! defined $short_name{$addr}
5340                 || ($name =~ /\D/
5341                     && length($name) < length($short_name{$addr})))
5342             {
5343                 # Remove interior underscores.
5344                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5345
5346                 $nominal_short_name_length{$addr} = length $name;
5347             }
5348         }
5349
5350         # If the short name isn't a nice one, perhaps an equivalent table has
5351         # a better one.
5352         if (! defined $short_name{$addr}
5353             || $short_name{$addr} eq ""
5354             || $short_name{$addr} eq "_")
5355         {
5356             my $return;
5357             foreach my $follower ($self->children) {    # All equivalents
5358                 my $follower_name = $follower->short_name;
5359                 next unless defined $follower_name;
5360
5361                 # Anything (except undefined) is better than underscore or
5362                 # empty
5363                 if (! defined $return || $return eq "_") {
5364                     $return = $follower_name;
5365                     next;
5366                 }
5367
5368                 # If the new follower name isn't "_" and is shorter than the
5369                 # current best one, prefer the new one.
5370                 next if $follower_name eq "_";
5371                 next if length $follower_name > length $return;
5372                 $return = $follower_name;
5373             }
5374             $short_name{$addr} = $return if defined $return;
5375         }
5376
5377         # If no suitable external name return undef
5378         if (! defined $short_name{$addr}) {
5379             $$nominal_length_ptr = undef if $nominal_length_ptr;
5380             return;
5381         }
5382
5383         # Don't allow a null short name.
5384         if ($short_name{$addr} eq "") {
5385             $short_name{$addr} = '_';
5386             $nominal_short_name_length{$addr} = 1;
5387         }
5388
5389         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5390
5391         if ($nominal_length_ptr) {
5392             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5393         }
5394         return $short_name{$addr};
5395     }
5396
5397     sub external_name {
5398         # Returns the external name that this table should be known by.  This
5399         # is usually the short_name, but not if the short_name is undefined,
5400         # in which case the external_name is arbitrarily set to the
5401         # underscore.
5402
5403         my $self = shift;
5404         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5405
5406         my $short = $self->short_name;
5407         return $short if defined $short;
5408
5409         return '_';
5410     }
5411
5412     sub add_description { # Adds the parameter as a short description.
5413
5414         my $self = shift;
5415         my $description = shift;
5416         chomp $description;
5417         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5418
5419         no overloading;
5420         push @{$description{pack 'J', $self}}, $description;
5421
5422         return;
5423     }
5424
5425     sub add_note { # Adds the parameter as a short note.
5426
5427         my $self = shift;
5428         my $note = shift;
5429         chomp $note;
5430         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5431
5432         no overloading;
5433         push @{$note{pack 'J', $self}}, $note;
5434
5435         return;
5436     }
5437
5438     sub add_comment { # Adds the parameter as a comment.
5439
5440         return unless $debugging_build;
5441
5442         my $self = shift;
5443         my $comment = shift;
5444         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5445
5446         chomp $comment;
5447
5448         no overloading;
5449         push @{$comment{pack 'J', $self}}, $comment;
5450
5451         return;
5452     }
5453
5454     sub comment {
5455         # Return the current comment for this table.  If called in list
5456         # context, returns the array of comments.  In scalar, returns a string
5457         # of each element joined together with a period ending each.
5458
5459         my $self = shift;
5460         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5461
5462         my $addr = do { no overloading; pack 'J', $self; };
5463         my @list = @{$comment{$addr}};
5464         return @list if wantarray;
5465         my $return = "";
5466         foreach my $sentence (@list) {
5467             $return .= '.  ' if $return;
5468             $return .= $sentence;
5469             $return =~ s/\.$//;
5470         }
5471         $return .= '.' if $return;
5472         return $return;
5473     }
5474
5475     sub initialize {
5476         # Initialize the table with the argument which is any valid
5477         # initialization for range lists.
5478
5479         my $self = shift;
5480         my $addr = do { no overloading; pack 'J', $self; };
5481         my $initialization = shift;
5482         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5483
5484         # Replace the current range list with a new one of the same exact
5485         # type.
5486         my $class = ref $range_list{$addr};
5487         $range_list{$addr} = $class->new(Owner => $self,
5488                                         Initialize => $initialization);
5489         return;
5490
5491     }
5492
5493     sub header {
5494         # The header that is output for the table in the file it is written
5495         # in.
5496
5497         my $self = shift;
5498         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5499
5500         my $return = "";
5501         $return .= $DEVELOPMENT_ONLY if $compare_versions;
5502         $return .= $HEADER;
5503         return $return;
5504     }
5505
5506     sub write {
5507         # Write a representation of the table to its file.  It calls several
5508         # functions furnished by sub-classes of this abstract base class to
5509         # handle non-normal ranges, to add stuff before the table, and at its
5510         # end.  If the table is to be written so that adjustments are
5511         # required, this does that conversion.
5512
5513         my $self = shift;
5514         my $use_adjustments = shift; # ? output in adjusted format or not
5515         my $tab_stops = shift;       # The number of tab stops over to put any
5516                                      # comment.
5517         my $suppress_value = shift;  # Optional, if the value associated with
5518                                      # a range equals this one, don't write
5519                                      # the range
5520         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5521
5522         my $addr = do { no overloading; pack 'J', $self; };
5523
5524         # Start with the header
5525         my @HEADER = $self->header;
5526
5527         # Then the comments
5528         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5529                                                         if $comment{$addr};
5530
5531         # Things discovered processing the main body of the document may
5532         # affect what gets output before it, therefore pre_body() isn't called
5533         # until after all other processing of the table is done.
5534
5535         # The main body looks like a 'here' document.  If there are comments,
5536         # get rid of them when processing it.
5537         my @OUT;
5538         if ($annotate || $output_range_counts) {
5539             # Use the line below in Perls that don't have /r
5540             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5541             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5542         } else {
5543             push @OUT, "return <<'END';\n";
5544         }
5545
5546         if ($range_list{$addr}->is_empty) {
5547
5548             # This is a kludge for empty tables to silence a warning in
5549             # utf8.c, which can't really deal with empty tables, but it can
5550             # deal with a table that matches nothing, as the inverse of 'Any'
5551             # does.
5552             push @OUT, "!utf8::Any\n";
5553         }
5554         elsif ($self->name eq 'N'
5555
5556                # To save disk space and table cache space, avoid putting out
5557                # binary N tables, but instead create a file which just inverts
5558                # the Y table.  Since the file will still exist and occupy a
5559                # certain number of blocks, might as well output the whole
5560                # thing if it all will fit in one block.   The number of
5561                # ranges below is an approximate number for that.
5562                && ($self->property->type == $BINARY
5563                    || $self->property->type == $FORCED_BINARY)
5564                # && $self->property->tables == 2  Can't do this because the
5565                #        non-binary properties, like NFDQC aren't specifiable
5566                #        by the notation
5567                && $range_list{$addr}->ranges > 15
5568                && ! $annotate)  # Under --annotate, want to see everything
5569         {
5570             push @OUT, "!utf8::" . $self->property->name . "\n";
5571         }
5572         else {
5573             my $range_size_1 = $range_size_1{$addr};
5574             my $format;            # Used only in $annotate option
5575             my $include_name;      # Used only in $annotate option
5576
5577             if ($annotate) {
5578
5579                 # If annotating each code point, must print 1 per line.
5580                 # The variable could point to a subroutine, and we don't want
5581                 # to lose that fact, so only set if not set already
5582                 $range_size_1 = 1 if ! $range_size_1;
5583
5584                 $format = $self->format;
5585
5586                 # The name of the character is output only for tables that
5587                 # don't already include the name in the output.
5588                 my $property = $self->property;
5589                 $include_name =
5590                     !  ($property == $perl_charname
5591                         || $property == main::property_ref('Unicode_1_Name')
5592                         || $property == main::property_ref('Name')
5593                         || $property == main::property_ref('Name_Alias')
5594                        );
5595             }
5596
5597             # Values for previous time through the loop.  Initialize to
5598             # something that won't be adjacent to the first iteration;
5599             # only $previous_end matters for that.
5600             my $previous_start;
5601             my $previous_end = -2;
5602             my $previous_value;
5603
5604             # Values for next time through the portion of the loop that splits
5605             # the range.  0 in $next_start means there is no remaining portion
5606             # to deal with.
5607             my $next_start = 0;
5608             my $next_end;
5609             my $next_value;
5610             my $offset = 0;
5611
5612             my $output_value_in_hex = $self->isa('Map_Table')
5613                                       && $self->format eq $HEX_ADJUST_FORMAT;
5614             # Use leading zeroes just for files whose format should not be
5615             # changed from what it has been.  Otherwise, they just take up
5616             # space and time to process.
5617             my $hex_format = ($self->isa('Map_Table')
5618                               && $self->to_output_map == $EXTERNAL_MAP)
5619                              ? "%04X"
5620                              : "%X";
5621
5622             # Output each range as part of the here document.
5623             RANGE:
5624             for my $set ($range_list{$addr}->ranges) {
5625                 if ($set->type != 0) {
5626                     $self->handle_special_range($set);
5627                     next RANGE;
5628                 }
5629                 my $start = $set->start;
5630                 my $end   = $set->end;
5631                 my $value  = $set->value;
5632
5633                 # Don't output ranges whose value is the one to suppress
5634                 next RANGE if defined $suppress_value
5635                               && $value eq $suppress_value;
5636
5637                 {   # This bare block encloses the scope where we may need to
5638                     # 'redo' to.  Consider the table that contains the
5639                     # lowercasing maps.  mktables stores the ASCII range ones
5640                     # as 26 ranges:
5641                     #       ord('A') => ord('a'), .. ord('Z') => ord('z')
5642                     # For compactness, the table that gets written has this as
5643                     # just one range
5644                     #       ( ord('A') .. ord('Z') ) => ord('a')
5645                     # and the software that reads the tables is smart enough
5646                     # to "connect the dots".  This change is accomplished in
5647                     # this loop by looking to see if the current iteration
5648                     # fits the paradigm of the previous iteration, and if so,
5649                     # we merge them by replacing the final output item with
5650                     # the merged data.  Repeated 25 times, this gets A-Z.  But
5651                     # we also have to make sure we don't screw up cases where
5652                     # we have internally stored
5653                     #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
5654                     # This single internal range has to be output as 3 ranges.
5655                     # (There are very few of these, so the gain of doing the
5656                     # combining of other ranges far outweighs the splitting of
5657                     # these.)  To accomplish this, we have to split the range,
5658                     # and each time through we handle the next portion of the
5659                     # original by ending this block with a 'redo'.   The
5660                     # values to use for that next time through are set up just
5661                     # below in the scalars whose names begin with '$next_'.
5662
5663                     if ($use_adjustments && ! $range_size_1) {
5664
5665                         # When converting to use adjustments, we can handle
5666                         # only single element ranges.  Set up so that this
5667                         # time through the loop, we look at the first element,
5668                         # and the next time through, we start off with the
5669                         # remainder.  Thus each time through we look at the
5670                         # first element of the range
5671                         if ($end != $start) {
5672                             $next_start = $start + 1;
5673                             $next_end = $end;
5674                             $next_value = $value;
5675                             $end = $start;
5676                         }
5677
5678                         # The values for some of these tables are stored as
5679                         # hex strings.  Convert those to decimal
5680                         $value = hex($value)
5681                                     if $self->default_map eq $CODE_POINT
5682                                         && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5683
5684                         # If this range is adjacent to the previous one, and
5685                         # the values in each are integers that are also
5686                         # adjacent (differ by 1), then this range really
5687                         # extends the previous one that is already in element
5688                         # $OUT[-1].  So we pop that element, and pretend that
5689                         # the range starts with whatever it started with.
5690                         # $offset is incremented by 1 each time so that it
5691                         # gives the current offset from the first element in
5692                         # the accumulating range, and we keep in $value the
5693                         # value of that first element.
5694                         if ($start == $previous_end + 1
5695                             && $value =~ /^ -? \d+ $/xa
5696                             && $previous_value =~ /^ -? \d+ $/xa
5697                             && ($value == ($previous_value + ++$offset)))
5698                         {
5699                             pop @OUT;
5700                             $start = $previous_start;
5701                             $value = $previous_value;
5702                         }
5703                         else {
5704                             $offset = 0;
5705                         }
5706
5707                         # Save the current values for the next time through
5708                         # the loop.
5709                         $previous_start = $start;
5710                         $previous_end = $end;
5711                         $previous_value = $value;
5712                     }
5713
5714                     # If there is a range and doesn't need a single point range
5715                     # output
5716                     if ($start != $end && ! $range_size_1) {
5717                         push @OUT, sprintf "$hex_format\t$hex_format",
5718                                              $start,       $end;
5719                         if ($value ne "") {
5720                             if ($output_value_in_hex) {
5721                                 $OUT[-1] .= sprintf "\t$hex_format", $value;
5722                             }
5723                             else {
5724                                 $OUT[-1] .= "\t$value";
5725                             }
5726                         }
5727
5728                         # Add a comment with the size of the range, if
5729                         # requested.  Expand Tabs to make sure they all start
5730                         # in the same column, and then unexpand to use mostly
5731                         # tabs.
5732                         if (! $output_range_counts{$addr}) {
5733                             $OUT[-1] .= "\n";
5734                         }
5735                         else {
5736                             $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5737                             my $count = main::clarify_number($end - $start + 1);
5738                             use integer;
5739
5740                             my $width = $tab_stops * 8 - 1;
5741                             $OUT[-1] = sprintf("%-*s # [%s]\n",
5742                                                 $width,
5743                                                 $OUT[-1],
5744                                                 $count);
5745                             $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5746                         }
5747                     }
5748
5749                         # Here to output a single code point per line.
5750                         # If not to annotate, use the simple formats
5751                     elsif (! $annotate) {
5752
5753                         # Use any passed in subroutine to output.
5754                         if (ref $range_size_1 eq 'CODE') {
5755                             for my $i ($start .. $end) {
5756                                 push @OUT, &{$range_size_1}($i, $value);
5757                             }
5758                         }
5759                         else {
5760
5761                             # Here, caller is ok with default output.
5762                             for (my $i = $start; $i <= $end; $i++) {
5763                                 if ($output_value_in_hex) {
5764                                     push @OUT,
5765                                         sprintf "$hex_format\t\t$hex_format\n",
5766                                                  $i,            $value;
5767                                 }
5768                                 else {
5769                                     push @OUT, sprintf $hex_format, $i;
5770                                     $OUT[-1] .= "\t\t$value" if $value ne "";
5771                                     $OUT[-1] .= "\n";
5772                                 }
5773                             }
5774                         }
5775                     }
5776                     else {
5777
5778                         # Here, wants annotation.
5779                         for (my $i = $start; $i <= $end; $i++) {
5780
5781                             # Get character information if don't have it already
5782                             main::populate_char_info($i)
5783                                                 if ! defined $viacode[$i];
5784                             my $type = $annotate_char_type[$i];
5785
5786                             # Figure out if should output the next code points
5787                             # as part of a range or not.  If this is not in an
5788                             # annotation range, then won't output as a range,
5789                             # so returns $i.  Otherwise use the end of the
5790                             # annotation range, but no further than the
5791                             # maximum possible end point of the loop.
5792                             my $range_end = main::min(
5793                                         $annotate_ranges->value_of($i) || $i,
5794                                         $end);
5795
5796                             # Use a range if it is a range, and either is one
5797                             # of the special annotation ranges, or the range
5798                             # is at most 3 long.  This last case causes the
5799                             # algorithmically named code points to be output
5800                             # individually in spans of at most 3, as they are
5801                             # the ones whose $type is > 0.
5802                             if ($range_end != $i
5803                                 && ( $type < 0 || $range_end - $i > 2))
5804                             {
5805                                 # Here is to output a range.  We don't allow a
5806                                 # caller-specified output format--just use the
5807                                 # standard one.
5808                                 push @OUT, sprintf
5809                                             "$hex_format\t$hex_format\t%s\t#",
5810                                               $i,         $range_end,  $value;
5811                                 my $range_name = $viacode[$i];
5812
5813                                 # For the code points which end in their hex
5814                                 # value, we eliminate that from the output
5815                                 # annotation, and capitalize only the first
5816                                 # letter of each word.
5817                                 if ($type == $CP_IN_NAME) {
5818                                     my $hex = sprintf $hex_format, $i;
5819                                     $range_name =~ s/-$hex$//;
5820                                     my @words = split " ", $range_name;
5821                                     for my $word (@words) {
5822                                         $word =
5823                                           ucfirst(lc($word)) if $word ne 'CJK';
5824                                     }
5825                                     $range_name = join " ", @words;
5826                                 }
5827                                 elsif ($type == $HANGUL_SYLLABLE) {
5828                                     $range_name = "Hangul Syllable";
5829                                 }
5830
5831                                 $OUT[-1] .= " $range_name" if $range_name;
5832
5833                                 # Include the number of code points in the
5834                                 # range
5835                                 my $count =
5836                                     main::clarify_number($range_end - $i + 1);
5837                                 $OUT[-1] .= " [$count]\n";
5838
5839                                 # Skip to the end of the range
5840                                 $i = $range_end;
5841                             }
5842                             else { # Not in a range.
5843                                 my $comment = "";
5844
5845                                 # When outputting the names of each character,
5846                                 # use the character itself if printable
5847                                 $comment .= "'" . chr($i) . "' "
5848                                                             if $printable[$i];
5849
5850                                 # To make it more readable, use a minimum
5851                                 # indentation
5852                                 my $comment_indent;
5853
5854                                 my $output_value = $value;
5855
5856                                 # Determine the annotation
5857                                 if ($format eq $DECOMP_STRING_FORMAT) {
5858
5859                                     # This is very specialized, with the type
5860                                     # of decomposition beginning the line
5861                                     # enclosed in <...>, and the code points
5862                                     # that the code point decomposes to
5863                                     # separated by blanks.  Create two
5864                                     # strings, one of the printable
5865                                     # characters, and one of their official
5866                                     # names.
5867                                     (my $map = $output_value)
5868                                                     =~ s/ \ * < .*? > \ +//x;
5869                                     my $tostr = "";
5870                                     my $to_name = "";
5871                                     my $to_chr = "";
5872                                     foreach my $to (split " ", $map) {
5873                                         $to = CORE::hex $to;
5874                                         $to_name .= " + " if $to_name;
5875                                         $to_chr .= chr($to);
5876                                         main::populate_char_info($to)
5877                                                     if ! defined $viacode[$to];
5878                                         $to_name .=  $viacode[$to];
5879                                     }
5880
5881                                     $comment .=
5882                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5883                                     $comment_indent = 25;   # Determined by
5884                                                             # experiment
5885                                 }
5886                                 else {
5887                                     $output_value = CORE::hex $value
5888                                            if $format eq $HEX_FORMAT
5889                                               || $format eq $HEX_ADJUST_FORMAT;
5890                                     $output_value += $offset
5891                                                    if $use_adjustments
5892                                                        # Don't try to adjust a
5893                                                        # non-integer
5894                                                    && $output_value !~ /[-\D]/;
5895
5896                                     # Assume that any table that has hex
5897                                     # format is a mapping of one code point to
5898                                     # another.
5899                                     if ($format eq $HEX_FORMAT
5900                                         || $format eq $HEX_ADJUST_FORMAT)
5901                                     {
5902                                         main::populate_char_info($output_value)
5903                                         if ! defined $viacode[$output_value];
5904                                         $comment .= "=> '"
5905                                         . chr($output_value)
5906                                         . "'; " if $printable[$output_value];
5907                                     }
5908                                     $comment .= $viacode[$i] if $include_name
5909                                                             && $viacode[$i];
5910                                     if ($format eq $HEX_FORMAT
5911                                         || $format eq $HEX_ADJUST_FORMAT)
5912                                     {
5913                                         $comment .=
5914                                             " => $viacode[$output_value]"
5915                                                 if $viacode[$output_value];
5916                                     }
5917
5918                                     $output_value = sprintf($hex_format,
5919                                                             $output_value)
5920                                         if  $format eq $HEX_ADJUST_FORMAT
5921                                             || ($format eq $HEX_FORMAT
5922                                                 && $self->replacement_property);
5923
5924                                     # If including the name, no need to
5925                                     # indent, as the name will already be way
5926                                     # across the line.
5927                                     $comment_indent = ($include_name) ? 0 : 60;
5928                                 }
5929
5930                                 # Use any passed in routine to output the base
5931                                 # part of the line.
5932                                 if (ref $range_size_1 eq 'CODE') {
5933                                     my $base_part=&{$range_size_1}
5934                                                         ($i, $output_value);
5935                                     chomp $base_part;
5936                                     push @OUT, $base_part;
5937                                 }
5938                                 else {
5939                                     push @OUT, sprintf "$hex_format\t\t%s",
5940                                                         $i, $output_value;
5941                                 }
5942
5943                                 # And add the annotation.
5944                                 $OUT[-1] = sprintf "%-*s\t# %s",
5945                                                    $comment_indent,
5946                                                    $OUT[-1],
5947                                                    $comment
5948                                             if $comment;
5949                                 $OUT[-1] .= "\n";
5950                             }
5951                         }
5952                     }
5953
5954                     # If we split the range, set up so the next time through
5955                     # we get the remainder, and redo.
5956                     if ($next_start) {
5957                         $start = $next_start;
5958                         $end = $next_end;
5959                         $value = $next_value;
5960                         $next_start = 0;
5961                         redo;
5962                     }
5963                 }
5964             } # End of loop through all the table's ranges
5965         }
5966
5967         # Add anything that goes after the main body, but within the here
5968         # document,
5969         my $append_to_body = $self->append_to_body;
5970         push @OUT, $append_to_body if $append_to_body;
5971
5972         # And finish the here document.
5973         push @OUT, "END\n";
5974
5975         # Done with the main portion of the body.  Can now figure out what
5976         # should appear before it in the file.
5977         my $pre_body = $self->pre_body;
5978         push @HEADER, $pre_body, "\n" if $pre_body;
5979
5980         # All these files should have a .pl suffix added to them.
5981         my @file_with_pl = @{$file_path{$addr}};
5982         $file_with_pl[-1] .= '.pl';
5983
5984         main::write(\@file_with_pl,
5985                     $annotate,      # utf8 iff annotating
5986                     \@HEADER,
5987                     \@OUT);
5988         return;
5989     }
5990
5991     sub set_status {    # Set the table's status
5992         my $self = shift;
5993         my $status = shift; # The status enum value
5994         my $info = shift;   # Any message associated with it.
5995         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5996
5997         my $addr = do { no overloading; pack 'J', $self; };
5998
5999         $status{$addr} = $status;
6000         $status_info{$addr} = $info;
6001         return;
6002     }
6003
6004     sub set_fate {  # Set the fate of a table
6005         my $self = shift;
6006         my $fate = shift;
6007         my $reason = shift;
6008         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6009
6010         my $addr = do { no overloading; pack 'J', $self; };
6011
6012         return if $fate{$addr} == $fate;    # If no-op
6013
6014         # Can only change the ordinary fate, except if going to $MAP_PROXIED
6015         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6016
6017         $fate{$addr} = $fate;
6018
6019         # Don't document anything to do with a non-normal fated table
6020         if ($fate != $ORDINARY) {
6021             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6022             foreach my $alias ($self->aliases) {
6023                 $alias->set_ucd($put_in_pod);
6024
6025                 # MAP_PROXIED doesn't affect the match tables
6026                 next if $fate == $MAP_PROXIED;
6027                 $alias->set_make_re_pod_entry($put_in_pod);
6028             }
6029         }
6030
6031         # Save the reason for suppression for output
6032         if ($fate == $SUPPRESSED && defined $reason) {
6033             $why_suppressed{$complete_name{$addr}} = $reason;
6034         }
6035
6036         return;
6037     }
6038
6039     sub lock {
6040         # Don't allow changes to the table from now on.  This stores a stack
6041         # trace of where it was called, so that later attempts to modify it
6042         # can immediately show where it got locked.
6043
6044         my $self = shift;
6045         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6046
6047         my $addr = do { no overloading; pack 'J', $self; };
6048
6049         $locked{$addr} = "";
6050
6051         my $line = (caller(0))[2];
6052         my $i = 1;
6053
6054         # Accumulate the stack trace
6055         while (1) {
6056             my ($pkg, $file, $caller_line, $caller) = caller $i++;
6057
6058             last unless defined $caller;
6059
6060             $locked{$addr} .= "    called from $caller() at line $line\n";
6061             $line = $caller_line;
6062         }
6063         $locked{$addr} .= "    called from main at line $line\n";
6064
6065         return;
6066     }
6067
6068     sub carp_if_locked {
6069         # Return whether a table is locked or not, and, by the way, complain
6070         # if is locked
6071
6072         my $self = shift;
6073         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6074
6075         my $addr = do { no overloading; pack 'J', $self; };
6076
6077         return 0 if ! $locked{$addr};
6078         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6079         return 1;
6080     }
6081
6082     sub set_file_path { # Set the final directory path for this table
6083         my $self = shift;
6084         # Rest of parameters passed on
6085
6086         no overloading;
6087         @{$file_path{pack 'J', $self}} = @_;
6088         return
6089     }
6090
6091     # Accessors for the range list stored in this table.  First for
6092     # unconditional
6093     for my $sub (qw(
6094                     containing_range
6095                     contains
6096                     count
6097                     each_range
6098                     hash
6099                     is_empty
6100                     matches_identically_to
6101                     max
6102                     min
6103                     range_count
6104                     reset_each_range
6105                     type_of
6106                     value_of
6107                 ))
6108     {
6109         no strict "refs";
6110         *$sub = sub {
6111             use strict "refs";
6112             my $self = shift;
6113             return $self->_range_list->$sub(@_);
6114         }
6115     }
6116
6117     # Then for ones that should fail if locked
6118     for my $sub (qw(
6119                     delete_range
6120                 ))
6121     {
6122         no strict "refs";
6123         *$sub = sub {
6124             use strict "refs";
6125             my $self = shift;
6126
6127             return if $self->carp_if_locked;
6128             no overloading;
6129             return $self->_range_list->$sub(@_);
6130         }
6131     }
6132
6133 } # End closure
6134
6135 package Map_Table;
6136 use parent '-norequire', '_Base_Table';
6137
6138 # A Map Table is a table that contains the mappings from code points to
6139 # values.  There are two weird cases:
6140 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6141 #    are written in the table's file at the end of the table nonetheless.  It
6142 #    requires specially constructed code to handle these; utf8.c can not read
6143 #    these in, so they should not go in $map_directory.  As of this writing,
6144 #    the only case that these happen is for named sequences used in
6145 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
6146 #    something else could come along that uses it.
6147 # 2) Specials are anything that doesn't fit syntactically into the body of the
6148 #    table.  The ranges for these have a map type of non-zero.  The code below
6149 #    knows about and handles each possible type.   In most cases, these are
6150 #    written as part of the header.
6151 #
6152 # A map table deliberately can't be manipulated at will unlike match tables.
6153 # This is because of the ambiguities having to do with what to do with
6154 # overlapping code points.  And there just isn't a need for those things;
6155 # what one wants to do is just query, add, replace, or delete mappings, plus
6156 # write the final result.
6157 # However, there is a method to get the list of possible ranges that aren't in
6158 # this table to use for defaulting missing code point mappings.  And,
6159 # map_add_or_replace_non_nulls() does allow one to add another table to this
6160 # one, but it is clearly very specialized, and defined that the other's
6161 # non-null values replace this one's if there is any overlap.
6162
6163 sub trace { return main::trace(@_); }
6164
6165 { # Closure
6166
6167     main::setup_package();
6168
6169     my %default_map;
6170     # Many input files omit some entries; this gives what the mapping for the
6171     # missing entries should be
6172     main::set_access('default_map', \%default_map, 'r');
6173
6174     my %anomalous_entries;
6175     # Things that go in the body of the table which don't fit the normal
6176     # scheme of things, like having a range.  Not much can be done with these
6177     # once there except to output them.  This was created to handle named
6178     # sequences.
6179     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6180     main::set_access('anomalous_entries',       # Append singular, read plural
6181                     \%anomalous_entries,
6182                     'readable_array');
6183
6184     my %replacement_property;
6185     # Certain files are unused by Perl itself, and are kept only for backwards
6186     # compatibility for programs that used them before Unicode::UCD existed.
6187     # These are termed legacy properties.  At some point they may be removed,
6188     # but for now mark them as legacy.  If non empty, this is the name of the
6189     # property to use instead (i.e., the modern equivalent).
6190     main::set_access('replacement_property', \%replacement_property, 'r');
6191
6192     my %to_output_map;
6193     # Enum as to whether or not to write out this map table, and how:
6194     #   0               don't output
6195     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6196     #                   it should not be removed nor its format changed.  This
6197     #                   is done for those files that have traditionally been
6198     #                   output.  Maps of legacy-only properties default to
6199     #                   this.
6200     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6201     #                   with this file
6202     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6203     #                   outputting the actual mappings as-is, we adjust things
6204     #                   to create a much more compact table. Only those few
6205     #                   tables where the mapping is convertible at least to an
6206     #                   integer and compacting makes a big difference should
6207     #                   have this.  Hence, the default is to not do this
6208     #                   unless the table's default mapping is to $CODE_POINT,
6209     #                   and the range size is not 1.
6210     main::set_access('to_output_map', \%to_output_map, 's');
6211
6212     sub new {
6213         my $class = shift;
6214         my $name = shift;
6215
6216         my %args = @_;
6217
6218         # Optional initialization data for the table.
6219         my $initialize = delete $args{'Initialize'};
6220
6221         my $default_map = delete $args{'Default_Map'};
6222         my $property = delete $args{'_Property'};
6223         my $full_name = delete $args{'Full_Name'};
6224         my $replacement_property = delete $args{'Replacement_Property'} // "";
6225         my $to_output_map = delete $args{'To_Output_Map'};
6226
6227         # Rest of parameters passed on; legacy properties have several common
6228         # other attributes
6229         if ($replacement_property) {
6230             $args{"Fate"} = $LEGACY_ONLY;
6231             $args{"Range_Size_1"} = 1;
6232             $args{"Perl_Extension"} = 1;
6233             $args{"UCD"} = 0;
6234         }
6235
6236         my $range_list = Range_Map->new(Owner => $property);
6237
6238         my $self = $class->SUPER::new(
6239                                     Name => $name,
6240                                     Complete_Name =>  $full_name,
6241                                     Full_Name => $full_name,
6242                                     _Property => $property,
6243                                     _Range_List => $range_list,
6244                                     %args);
6245
6246         my $addr = do { no overloading; pack 'J', $self; };
6247
6248         $anomalous_entries{$addr} = [];
6249         $default_map{$addr} = $default_map;
6250         $replacement_property{$addr} = $replacement_property;
6251         $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6252                                           && $replacement_property;
6253         $to_output_map{$addr} = $to_output_map;
6254
6255         $self->initialize($initialize) if defined $initialize;
6256
6257         return $self;
6258     }
6259
6260     use overload
6261         fallback => 0,
6262         qw("") => "_operator_stringify",
6263     ;
6264
6265     sub _operator_stringify {
6266         my $self = shift;
6267
6268         my $name = $self->property->full_name;
6269         $name = '""' if $name eq "";
6270         return "Map table for Property '$name'";
6271     }
6272
6273     sub add_alias {
6274         # Add a synonym for this table (which means the property itself)
6275         my $self = shift;
6276         my $name = shift;
6277         # Rest of parameters passed on.
6278
6279         $self->SUPER::add_alias($name, $self->property, @_);
6280         return;
6281     }
6282
6283     sub add_map {
6284         # Add a range of code points to the list of specially-handled code
6285         # points.  $MULTI_CP is assumed if the type of special is not passed
6286         # in.
6287
6288         my $self = shift;
6289         my $lower = shift;
6290         my $upper = shift;
6291         my $string = shift;
6292         my %args = @_;
6293
6294         my $type = delete $args{'Type'} || 0;
6295         # Rest of parameters passed on
6296
6297         # Can't change the table if locked.
6298         return if $self->carp_if_locked;
6299
6300         my $addr = do { no overloading; pack 'J', $self; };
6301
6302         $self->_range_list->add_map($lower, $upper,
6303                                     $string,
6304                                     @_,
6305                                     Type => $type);
6306         return;
6307     }
6308
6309     sub append_to_body {
6310         # Adds to the written HERE document of the table's body any anomalous
6311         # entries in the table..
6312
6313         my $self = shift;
6314         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6315
6316         my $addr = do { no overloading; pack 'J', $self; };
6317
6318         return "" unless @{$anomalous_entries{$addr}};
6319         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6320     }
6321
6322     sub map_add_or_replace_non_nulls {
6323         # This adds the mappings in the table $other to $self.  Non-null
6324         # mappings from $other override those in $self.  It essentially merges
6325         # the two tables, with the second having priority except for null
6326         # mappings.
6327
6328         my $self = shift;
6329         my $other = shift;
6330         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6331
6332         return if $self->carp_if_locked;
6333
6334         if (! $other->isa(__PACKAGE__)) {
6335             Carp::my_carp_bug("$other should be a "
6336                         . __PACKAGE__
6337                         . ".  Not a '"
6338                         . ref($other)
6339                         . "'.  Not added;");
6340             return;
6341         }
6342
6343         my $addr = do { no overloading; pack 'J', $self; };
6344         my $other_addr = do { no overloading; pack 'J', $other; };
6345
6346         local $to_trace = 0 if main::DEBUG;
6347
6348         my $self_range_list = $self->_range_list;
6349         my $other_range_list = $other->_range_list;
6350         foreach my $range ($other_range_list->ranges) {
6351             my $value = $range->value;
6352             next if $value eq "";
6353             $self_range_list->_add_delete('+',
6354                                           $range->start,
6355                                           $range->end,
6356                                           $value,
6357                                           Type => $range->type,
6358                                           Replace => $UNCONDITIONALLY);
6359         }
6360
6361         return;
6362     }
6363
6364     sub set_default_map {
6365         # Define what code points that are missing from the input files should
6366         # map to
6367
6368         my $self = shift;
6369         my $map = shift;
6370         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6371
6372         my $addr = do { no overloading; pack 'J', $self; };
6373
6374         # Convert the input to the standard equivalent, if any (won't have any
6375         # for $STRING properties)
6376         my $standard = $self->_find_table_from_alias->{$map};
6377         $map = $standard->name if defined $standard;
6378
6379         # Warn if there already is a non-equivalent default map for this
6380         # property.  Note that a default map can be a ref, which means that
6381         # what it actually means is delayed until later in the program, and it
6382         # IS permissible to override it here without a message.
6383         my $default_map = $default_map{$addr};
6384         if (defined $default_map
6385             && ! ref($default_map)
6386             && $default_map ne $map
6387             && main::Standardize($map) ne $default_map)
6388         {
6389             my $property = $self->property;
6390             my $map_table = $property->table($map);
6391             my $default_table = $property->table($default_map);
6392             if (defined $map_table
6393                 && defined $default_table
6394                 && $map_table != $default_table)
6395             {
6396                 Carp::my_carp("Changing the default mapping for "
6397                             . $property
6398                             . " from $default_map to $map'");
6399             }
6400         }
6401
6402         $default_map{$addr} = $map;
6403
6404         # Don't also create any missing table for this map at this point,
6405         # because if we did, it could get done before the main table add is
6406         # done for PropValueAliases.txt; instead the caller will have to make
6407         # sure it exists, if desired.
6408         return;
6409     }
6410
6411     sub to_output_map {
6412         # Returns boolean: should we write this map table?
6413
6414         my $self = shift;
6415         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6416
6417         my $addr = do { no overloading; pack 'J', $self; };
6418
6419         # If overridden, use that
6420         return $to_output_map{$addr} if defined $to_output_map{$addr};
6421
6422         my $full_name = $self->full_name;
6423         return $global_to_output_map{$full_name}
6424                                 if defined $global_to_output_map{$full_name};
6425
6426         # If table says to output, do so; if says to suppress it, do so.
6427         my $fate = $self->fate;
6428         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6429         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6430         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6431
6432         my $type = $self->property->type;
6433
6434         # Don't want to output binary map tables even for debugging.
6435         return 0 if $type == $BINARY;
6436
6437         # But do want to output string ones.  All the ones that remain to
6438         # be dealt with (i.e. which haven't explicitly been set to external)
6439         # are for internal Perl use only.  The default for those that map to
6440         # $CODE_POINT and haven't been restricted to a single element range
6441         # is to use the adjusted form.
6442         if ($type == $STRING) {
6443             return $INTERNAL_MAP if $self->range_size_1
6444                                     || $default_map{$addr} ne $CODE_POINT;
6445             return $OUTPUT_ADJUSTED;
6446         }
6447
6448         # Otherwise is an $ENUM, do output it, for Perl's purposes
6449         return $INTERNAL_MAP;
6450     }
6451
6452     sub inverse_list {
6453         # Returns a Range_List that is gaps of the current table.  That is,
6454         # the inversion
6455
6456         my $self = shift;
6457         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6458
6459         my $current = Range_List->new(Initialize => $self->_range_list,
6460                                 Owner => $self->property);
6461         return ~ $current;
6462     }
6463
6464     sub header {
6465         my $self = shift;
6466         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6467
6468         my $return = $self->SUPER::header();
6469
6470         if ($self->to_output_map >= $INTERNAL_MAP) {
6471             $return .= $INTERNAL_ONLY_HEADER;
6472         }
6473         else {
6474             my $property_name = $self->property->replacement_property;
6475
6476             # The legacy-only properties were gotten above; but there are some
6477             # other properties whose files are in current use that have fixed
6478             # formats.
6479             $property_name = $self->property->full_name unless $property_name;
6480
6481             $return .= <<END;
6482
6483 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6484
6485 # This file is for internal use by core Perl only.  It is retained for
6486 # backwards compatibility with applications that may have come to rely on it,
6487 # but its format and even its name or existence are subject to change without
6488 # notice in a future Perl version.  Don't use it directly.  Instead, its
6489 # contents are now retrievable through a stable API in the Unicode::UCD
6490 # module: Unicode::UCD::prop_invmap('$property_name').
6491 END
6492         }
6493         return $return;
6494     }
6495
6496     sub set_final_comment {
6497         # Just before output, create the comment that heads the file
6498         # containing this table.
6499
6500         return unless $debugging_build;
6501
6502         my $self = shift;
6503         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6504
6505         # No sense generating a comment if aren't going to write it out.
6506         return if ! $self->to_output_map;
6507
6508         my $addr = do { no overloading; pack 'J', $self; };
6509
6510         my $property = $self->property;
6511
6512         # Get all the possible names for this property.  Don't use any that
6513         # aren't ok for use in a file name, etc.  This is perhaps causing that
6514         # flag to do double duty, and may have to be changed in the future to
6515         # have our own flag for just this purpose; but it works now to exclude
6516         # Perl generated synonyms from the lists for properties, where the
6517         # name is always the proper Unicode one.
6518         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6519
6520         my $count = $self->count;
6521         my $default_map = $default_map{$addr};
6522
6523         # The ranges that map to the default aren't output, so subtract that
6524         # to get those actually output.  A property with matching tables
6525         # already has the information calculated.
6526         if ($property->type != $STRING) {
6527             $count -= $property->table($default_map)->count;
6528         }
6529         elsif (defined $default_map) {
6530
6531             # But for $STRING properties, must calculate now.  Subtract the
6532             # count from each range that maps to the default.
6533             foreach my $range ($self->_range_list->ranges) {
6534                 if ($range->value eq $default_map) {
6535                     $count -= $range->end +1 - $range->start;
6536                 }
6537             }
6538
6539         }
6540
6541         # Get a  string version of $count with underscores in large numbers,
6542         # for clarity.
6543         my $string_count = main::clarify_number($count);
6544
6545         my $code_points = ($count == 1)
6546                         ? 'single code point'
6547                         : "$string_count code points";
6548
6549         my $mapping;
6550         my $these_mappings;
6551         my $are;
6552         if (@property_aliases <= 1) {
6553             $mapping = 'mapping';
6554             $these_mappings = 'this mapping';
6555             $are = 'is'
6556         }
6557         else {
6558             $mapping = 'synonymous mappings';
6559             $these_mappings = 'these mappings';
6560             $are = 'are'
6561         }
6562         my $cp;
6563         if ($count >= $MAX_UNICODE_CODEPOINTS) {
6564             $cp = "any code point in Unicode Version $string_version";
6565         }
6566         else {
6567             my $map_to;
6568             if ($default_map eq "") {
6569                 $map_to = 'the null string';
6570             }
6571             elsif ($default_map eq $CODE_POINT) {
6572                 $map_to = "itself";
6573             }
6574             else {
6575                 $map_to = "'$default_map'";
6576             }
6577             if ($count == 1) {
6578                 $cp = "the single code point";
6579             }
6580             else {
6581                 $cp = "one of the $code_points";
6582             }
6583             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6584         }
6585
6586         my $comment = "";
6587
6588         my $status = $self->status;
6589         if ($status ne $NORMAL) {
6590             my $warn = uc $status_past_participles{$status};
6591             $comment .= <<END;
6592
6593 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
6594  All property or property=value combinations contained in this file are $warn.
6595  See $unicode_reference_url for what this means.
6596
6597 END
6598         }
6599         $comment .= "This file returns the $mapping:\n";
6600
6601         my $ucd_accessible_name = "";
6602         my $full_name = $self->property->full_name;
6603         for my $i (0 .. @property_aliases - 1) {
6604             my $name = $property_aliases[$i]->name;
6605             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6606             if ($property_aliases[$i]->ucd) {
6607                 if ($name eq $full_name) {
6608                     $ucd_accessible_name = $full_name;
6609                 }
6610                 elsif (! $ucd_accessible_name) {
6611                     $ucd_accessible_name = $name;
6612                 }
6613             }
6614         }
6615         $comment .= "\nwhere 'cp' is $cp.";
6616         if ($ucd_accessible_name) {
6617             $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6618         }
6619
6620         # And append any commentary already set from the actual property.
6621         $comment .= "\n\n" . $self->comment if $self->comment;
6622         if ($self->description) {
6623             $comment .= "\n\n" . join " ", $self->description;
6624         }
6625         if ($self->note) {
6626             $comment .= "\n\n" . join " ", $self->note;
6627         }
6628         $comment .= "\n";
6629
6630         if (! $self->perl_extension) {
6631             $comment .= <<END;
6632
6633 For information about what this property really means, see:
6634 $unicode_reference_url
6635 END
6636         }
6637
6638         if ($count) {        # Format differs for empty table
6639                 $comment.= "\nThe format of the ";
6640             if ($self->range_size_1) {
6641                 $comment.= <<END;
6642 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6643 is in hex; MAPPING is what CODE_POINT maps to.
6644 END
6645             }
6646             else {
6647
6648                 # There are tables which end up only having one element per
6649                 # range, but it is not worth keeping track of for making just
6650                 # this comment a little better.
6651                 $comment.= <<END;
6652 non-comment portions of the main body of lines of this file is:
6653 START\\tSTOP\\tMAPPING where START is the starting code point of the
6654 range, in hex; STOP is the ending point, or if omitted, the range has just one
6655 code point; MAPPING is what each code point between START and STOP maps to.
6656 END
6657                 if ($self->output_range_counts) {
6658                     $comment .= <<END;
6659 Numbers in comments in [brackets] indicate how many code points are in the
6660 range (omitted when the range is a single code point or if the mapping is to
6661 the null string).
6662 END
6663                 }
6664             }
6665         }
6666         $self->set_comment(main::join_lines($comment));
6667         return;
6668     }
6669
6670     my %swash_keys; # Makes sure don't duplicate swash names.
6671
6672     # The remaining variables are temporaries used while writing each table,
6673     # to output special ranges.
6674     my @multi_code_point_maps;  # Map is to more than one code point.
6675
6676     sub handle_special_range {
6677         # Called in the middle of write when it finds a range it doesn't know
6678         # how to handle.
6679
6680         my $self = shift;
6681         my $range = shift;
6682         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6683
6684         my $addr = do { no overloading; pack 'J', $self; };
6685
6686         my $type = $range->type;
6687
6688         my $low = $range->start;
6689         my $high = $range->end;
6690         my $map = $range->value;
6691
6692         # No need to output the range if it maps to the default.
6693         return if $map eq $default_map{$addr};
6694
6695         my $property = $self->property;
6696
6697         # Switch based on the map type...
6698         if ($type == $HANGUL_SYLLABLE) {
6699
6700             # These are entirely algorithmically determinable based on
6701             # some constants furnished by Unicode; for now, just set a
6702             # flag to indicate that have them.  After everything is figured
6703             # out, we will output the code that does the algorithm.  (Don't
6704             # output them if not needed because we are suppressing this
6705             # property.)
6706             $has_hangul_syllables = 1 if $property->to_output_map;
6707         }
6708         elsif ($type == $CP_IN_NAME) {
6709
6710             # Code points whose name ends in their code point are also
6711             # algorithmically determinable, but need information about the map
6712             # to do so.  Both the map and its inverse are stored in data
6713             # structures output in the file.  They are stored in the mean time
6714             # in global lists The lists will be written out later into Name.pm,
6715             # which is created only if needed.  In order to prevent duplicates
6716             # in the list, only add to them for one property, should multiple
6717             # ones need them.
6718             if ($needing_code_points_ending_in_code_point == 0) {
6719                 $needing_code_points_ending_in_code_point = $property;
6720             }
6721             if ($property == $needing_code_points_ending_in_code_point) {
6722                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6723                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6724
6725                 my $squeezed = $map =~ s/[-\s]+//gr;
6726                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6727                                                                           $low;
6728                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6729                                                                          $high;
6730
6731                 push @code_points_ending_in_code_point, { low => $low,
6732                                                         high => $high,
6733                                                         name => $map
6734                                                         };
6735             }
6736         }
6737         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6738
6739             # Multi-code point maps and null string maps have an entry
6740             # for each code point in the range.  They use the same
6741             # output format.
6742             for my $code_point ($low .. $high) {
6743
6744                 # The pack() below can't cope with surrogates.  XXX This may
6745                 # no longer be true
6746                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6747                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
6748                     next;
6749                 }
6750
6751                 # Generate the hash entries for these in the form that
6752                 # utf8.c understands.
6753                 my $tostr = "";
6754                 my $to_name = "";
6755                 my $to_chr = "";
6756                 foreach my $to (split " ", $map) {
6757                     if ($to !~ /^$code_point_re$/) {
6758                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
6759                         next;
6760                     }
6761                     $tostr .= sprintf "\\x{%s}", $to;
6762                     $to = CORE::hex $to;
6763                     if ($annotate) {
6764                         $to_name .= " + " if $to_name;
6765                         $to_chr .= chr($to);
6766                         main::populate_char_info($to)
6767                                             if ! defined $viacode[$to];
6768                         $to_name .=  $viacode[$to];
6769                     }
6770                 }
6771
6772                 # I (khw) have never waded through this line to
6773                 # understand it well enough to comment it.
6774                 my $utf8 = sprintf(qq["%s" => "$tostr",],
6775                         join("", map { sprintf "\\x%02X", $_ }
6776                             unpack("U0C*", pack("U", $code_point))));
6777
6778                 # Add a comment so that a human reader can more easily
6779                 # see what's going on.
6780                 push @multi_code_point_maps,
6781                         sprintf("%-45s # U+%04X", $utf8, $code_point);
6782                 if (! $annotate) {
6783                     $multi_code_point_maps[-1] .= " => $map";
6784                 }
6785                 else {
6786                     main::populate_char_info($code_point)
6787                                     if ! defined $viacode[$code_point];
6788                     $multi_code_point_maps[-1] .= " '"
6789                         . chr($code_point)
6790                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6791                 }
6792             }
6793         }
6794         else {
6795             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
6796         }
6797
6798         return;
6799     }
6800
6801     sub pre_body {
6802         # Returns the string that should be output in the file before the main
6803         # body of this table.  It isn't called until the main body is
6804         # calculated, saving a pass.  The string includes some hash entries
6805         # identifying the format of the body, and what the single value should
6806         # be for all ranges missing from it.  It also includes any code points
6807         # which have map_types that don't go in the main table.
6808
6809         my $self = shift;
6810         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6811
6812         my $addr = do { no overloading; pack 'J', $self; };
6813
6814         my $name = $self->property->swash_name;
6815
6816         # Currently there is nothing in the pre_body unless a swash is being
6817         # generated.
6818         return unless defined $name;
6819
6820         if (defined $swash_keys{$name}) {
6821             Carp::my_carp(main::join_lines(<<END
6822 Already created a swash name '$name' for $swash_keys{$name}.  This means that
6823 the same name desired for $self shouldn't be used.  Bad News.  This must be
6824 fixed before production use, but proceeding anyway
6825 END
6826             ));
6827         }
6828         $swash_keys{$name} = "$self";
6829
6830         my $pre_body = "";
6831
6832         # Here we assume we were called after have gone through the whole
6833         # file.  If we actually generated anything for each map type, add its
6834         # respective header and trailer
6835         my $specials_name = "";
6836         if (@multi_code_point_maps) {
6837             $specials_name = "utf8::ToSpec$name";
6838             $pre_body .= <<END;
6839
6840 # Some code points require special handling because their mappings are each to
6841 # multiple code points.  These do not appear in the main body, but are defined
6842 # in the hash below.
6843
6844 # Each key is the string of N bytes that together make up the UTF-8 encoding
6845 # for the code point.  (i.e. the same as looking at the code point's UTF-8
6846 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
6847 \%$specials_name = (
6848 END
6849             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6850         }
6851
6852         my $format = $self->format;
6853
6854         my $return = "";
6855
6856         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6857         if ($output_adjusted) {
6858             if ($specials_name) {
6859                 $return .= <<END;
6860 # The mappings in the non-hash portion of this file must be modified to get the
6861 # correct values by adding the code point ordinal number to each one that is
6862 # numeric.
6863 END
6864             }
6865             else {
6866                 $return .= <<END;
6867 # The mappings must be modified to get the correct values by adding the code
6868 # point ordinal number to each one that is numeric.
6869 END
6870             }
6871         }
6872
6873         $return .= <<END;
6874
6875 # The name this swash is to be known by, with the format of the mappings in
6876 # the main body of the table, and what all code points missing from this file
6877 # map to.
6878 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6879 END
6880         if ($specials_name) {
6881             $return .= <<END;
6882 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6883 END
6884         }
6885         my $default_map = $default_map{$addr};
6886
6887         # For $CODE_POINT default maps and using adjustments, instead the default
6888         # becomes zero.
6889         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6890                 .  (($output_adjusted && $default_map eq $CODE_POINT)
6891                    ? "0"
6892                    : $default_map)
6893                 . "';";
6894
6895         if ($default_map eq $CODE_POINT) {
6896             $return .= ' # code point maps to itself';
6897         }
6898         elsif ($default_map eq "") {
6899             $return .= ' # code point maps to the null string';
6900         }
6901         $return .= "\n";
6902
6903         $return .= $pre_body;
6904
6905         return $return;
6906     }
6907
6908     sub write {
6909         # Write the table to the file.
6910
6911         my $self = shift;
6912         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6913
6914         my $addr = do { no overloading; pack 'J', $self; };
6915
6916         # Clear the temporaries
6917         undef @multi_code_point_maps;
6918
6919         # Calculate the format of the table if not already done.
6920         my $format = $self->format;
6921         my $type = $self->property->type;
6922         my $default_map = $self->default_map;
6923         if (! defined $format) {
6924             if ($type == $BINARY) {
6925
6926                 # Don't bother checking the values, because we elsewhere
6927                 # verify that a binary table has only 2 values.
6928                 $format = $BINARY_FORMAT;
6929             }
6930             else {
6931                 my @ranges = $self->_range_list->ranges;
6932
6933                 # default an empty table based on its type and default map
6934                 if (! @ranges) {
6935
6936                     # But it turns out that the only one we can say is a
6937                     # non-string (besides binary, handled above) is when the
6938                     # table is a string and the default map is to a code point
6939                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6940                         $format = $HEX_FORMAT;
6941                     }
6942                     else {
6943                         $format = $STRING_FORMAT;
6944                     }
6945                 }
6946                 else {
6947
6948                     # Start with the most restrictive format, and as we find
6949                     # something that doesn't fit with that, change to the next
6950                     # most restrictive, and so on.
6951                     $format = $DECIMAL_FORMAT;
6952                     foreach my $range (@ranges) {
6953                         next if $range->type != 0;  # Non-normal ranges don't
6954                                                     # affect the main body
6955                         my $map = $range->value;
6956                         if ($map ne $default_map) {
6957                             last if $format eq $STRING_FORMAT;  # already at
6958                                                                 # least
6959                                                                 # restrictive
6960                             $format = $INTEGER_FORMAT
6961                                                 if $format eq $DECIMAL_FORMAT
6962                                                     && $map !~ / ^ [0-9] $ /x;
6963                             $format = $FLOAT_FORMAT
6964                                             if $format eq $INTEGER_FORMAT
6965                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6966                             $format = $RATIONAL_FORMAT
6967                                 if $format eq $FLOAT_FORMAT
6968                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6969                             $format = $HEX_FORMAT
6970                                 if ($format eq $RATIONAL_FORMAT
6971                                        && $map !~
6972                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6973                                         # Assume a leading zero means hex,
6974                                         # even if all digits are 0-9
6975                                     || ($format eq $INTEGER_FORMAT
6976                                         && $map =~ /^0[0-9A-F]/);
6977                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6978                                                        && $map =~ /[^0-9A-F]/;
6979                         }
6980                     }
6981                 }
6982             }
6983         } # end of calculating format
6984
6985         if ($default_map eq $CODE_POINT
6986             && $format ne $HEX_FORMAT
6987             && ! defined $self->format)    # manual settings are always
6988                                            # considered ok
6989         {
6990             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6991         }
6992
6993         # If the output is to be adjusted, the format of the table that gets
6994         # output is actually 'a' or 'ax' instead of whatever it is stored
6995         # internally as.
6996         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6997         if ($output_adjusted) {
6998             if ($default_map eq $CODE_POINT) {
6999                 $format = $HEX_ADJUST_FORMAT;
7000             }
7001             else {
7002                 $format = $ADJUST_FORMAT;
7003             }
7004         }
7005
7006         $self->_set_format($format);
7007
7008         return $self->SUPER::write(
7009             $output_adjusted,
7010             ($self->property == $block)
7011                 ? 7     # block file needs more tab stops
7012                 : 3,
7013             $default_map);   # don't write defaulteds
7014     }
7015
7016     # Accessors for the underlying list that should fail if locked.
7017     for my $sub (qw(
7018                     add_duplicate
7019                 ))
7020     {
7021         no strict "refs";
7022         *$sub = sub {
7023             use strict "refs";
7024             my $self = shift;
7025
7026             return if $self->carp_if_locked;
7027             return $self->_range_list->$sub(@_);
7028         }
7029     }
7030 } # End closure for Map_Table
7031
7032 package Match_Table;
7033 use parent '-norequire', '_Base_Table';
7034
7035 # A Match table is one which is a list of all the code points that have
7036 # the same property and property value, for use in \p{property=value}
7037 # constructs in regular expressions.  It adds very little data to the base
7038 # structure, but many methods, as these lists can be combined in many ways to
7039 # form new ones.
7040 # There are only a few concepts added:
7041 # 1) Equivalents and Relatedness.
7042 #    Two tables can match the identical code points, but have different names.
7043 #    This always happens when there is a perl single form extension
7044 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7045 #    tables are set to be related, with the Perl extension being a child, and
7046 #    the Unicode property being the parent.
7047 #
7048 #    It may be that two tables match the identical code points and we don't
7049 #    know if they are related or not.  This happens most frequently when the
7050 #    Block and Script properties have the exact range.  But note that a
7051 #    revision to Unicode could add new code points to the script, which would
7052 #    now have to be in a different block (as the block was filled, or there
7053 #    would have been 'Unknown' script code points in it and they wouldn't have
7054 #    been identical).  So we can't rely on any two properties from Unicode
7055 #    always matching the same code points from release to release, and thus
7056 #    these tables are considered coincidentally equivalent--not related.  When
7057 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7058 #    'leader', and the others are 'equivalents'.  This concept is useful
7059 #    to minimize the number of tables written out.  Only one file is used for
7060 #    any identical set of code points, with entries in Heavy.pl mapping all
7061 #    the involved tables to it.
7062 #
7063 #    Related tables will always be identical; we set them up to be so.  Thus
7064 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7065 #    unrelated tables.  Relatedness makes generating the documentation easier.
7066 #
7067 # 2) Complement.
7068 #    Like equivalents, two tables may be the inverses of each other, the
7069 #    intersection between them is null, and the union is every Unicode code
7070 #    point.  The two tables that occupy a binary property are necessarily like
7071 #    this.  By specifying one table as the complement of another, we can avoid
7072 #    storing it on disk (using the other table and performing a fast
7073 #    transform), and some memory and calculations.
7074 #
7075 # 3) Conflicting.  It may be that there will eventually be name clashes, with
7076 #    the same name meaning different things.  For a while, there actually were
7077 #    conflicts, but they have so far been resolved by changing Perl's or
7078 #    Unicode's definitions to match the other, but when this code was written,
7079 #    it wasn't clear that that was what was going to happen.  (Unicode changed
7080 #    because of protests during their beta period.)  Name clashes are warned
7081 #    about during compilation, and the documentation.  The generated tables
7082 #    are sane, free of name clashes, because the code suppresses the Perl
7083 #    version.  But manual intervention to decide what the actual behavior
7084 #    should be may be required should this happen.  The introductory comments
7085 #    have more to say about this.
7086
7087 sub standardize { return main::standardize($_[0]); }
7088 sub trace { return main::trace(@_); }
7089
7090
7091 { # Closure
7092
7093     main::setup_package();
7094
7095     my %leader;
7096     # The leader table of this one; initially $self.
7097     main::set_access('leader', \%leader, 'r');
7098
7099     my %equivalents;
7100     # An array of any tables that have this one as their leader
7101     main::set_access('equivalents', \%equivalents, 'readable_array');
7102
7103     my %parent;
7104     # The parent table to this one, initially $self.  This allows us to
7105     # distinguish between equivalent tables that are related (for which this
7106     # is set to), and those which may not be, but share the same output file
7107     # because they match the exact same set of code points in the current
7108     # Unicode release.
7109     main::set_access('parent', \%parent, 'r');
7110
7111     my %children;
7112     # An array of any tables that have this one as their parent
7113     main::set_access('children', \%children, 'readable_array');
7114
7115     my %conflicting;
7116     # Array of any tables that would have the same name as this one with
7117     # a different meaning.  This is used for the generated documentation.
7118     main::set_access('conflicting', \%conflicting, 'readable_array');
7119
7120     my %matches_all;
7121     # Set in the constructor for tables that are expected to match all code
7122     # points.
7123     main::set_access('matches_all', \%matches_all, 'r');
7124
7125     my %complement;
7126     # Points to the complement that this table is expressed in terms of; 0 if
7127     # none.
7128     main::set_access('complement', \%complement, 'r');
7129
7130     sub new {
7131         my $class = shift;
7132
7133         my %args = @_;
7134
7135         # The property for which this table is a listing of property values.
7136         my $property = delete $args{'_Property'};
7137
7138         my $name = delete $args{'Name'};
7139         my $full_name = delete $args{'Full_Name'};
7140         $full_name = $name if ! defined $full_name;
7141
7142         # Optional
7143         my $initialize = delete $args{'Initialize'};
7144         my $matches_all = delete $args{'Matches_All'} || 0;
7145         my $format = delete $args{'Format'};
7146         # Rest of parameters passed on.
7147
7148         my $range_list = Range_List->new(Initialize => $initialize,
7149                                          Owner => $property);
7150
7151         my $complete = $full_name;
7152         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7153                                               # but this helps debug if it
7154                                               # does
7155         # The complete name for a match table includes it's property in a
7156         # compound form 'property=table', except if the property is the
7157         # pseudo-property, perl, in which case it is just the single form,
7158         # 'table' (If you change the '=' must also change the ':' in lots of
7159         # places in this program that assume an equal sign)
7160         $complete = $property->full_name . "=$complete" if $property != $perl;
7161
7162         my $self = $class->SUPER::new(%args,
7163                                       Name => $name,
7164                                       Complete_Name => $complete,
7165                                       Full_Name => $full_name,
7166                                       _Property => $property,
7167                                       _Range_List => $range_list,
7168                                       Format => $EMPTY_FORMAT,
7169                                       );
7170         my $addr = do { no overloading; pack 'J', $self; };
7171
7172         $conflicting{$addr} = [ ];
7173         $equivalents{$addr} = [ ];
7174         $children{$addr} = [ ];
7175         $matches_all{$addr} = $matches_all;
7176         $leader{$addr} = $self;
7177         $parent{$addr} = $self;
7178         $complement{$addr} = 0;
7179
7180         if (defined $format && $format ne $EMPTY_FORMAT) {
7181             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7182         }
7183
7184         return $self;
7185     }
7186
7187     # See this program's beginning comment block about overloading these.
7188     use overload
7189         fallback => 0,
7190         qw("") => "_operator_stringify",
7191         '=' => sub {
7192                     my $self = shift;
7193
7194                     return if $self->carp_if_locked;
7195                     return $self;
7196                 },
7197
7198         '+' => sub {
7199                         my $self = shift;
7200                         my $other = shift;
7201
7202                         return $self->_range_list + $other;
7203                     },
7204         '&' => sub {
7205                         my $self = shift;
7206                         my $other = shift;
7207
7208                         return $self->_range_list & $other;
7209                     },
7210         '+=' => sub {
7211                         my $self = shift;
7212                         my $other = shift;
7213                         my $reversed = shift;
7214
7215                         if ($reversed) {
7216                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7217                             . ref($other)
7218                             . ' += '
7219                             . ref($self)
7220                             . "'.  undef returned.");
7221                             return;
7222                         }
7223
7224                         return if $self->carp_if_locked;
7225
7226                         my $addr = do { no overloading; pack 'J', $self; };
7227
7228                         if (ref $other) {
7229
7230                             # Change the range list of this table to be the
7231                             # union of the two.
7232                             $self->_set_range_list($self->_range_list
7233                                                     + $other);
7234                         }
7235                         else {    # $other is just a simple value
7236                             $self->add_range($other, $other);
7237                         }
7238                         return $self;
7239                     },
7240         '&=' => sub {
7241                         my $self = shift;
7242                         my $other = shift;
7243                         my $reversed = shift;
7244
7245                         if ($reversed) {
7246                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7247                             . ref($other)
7248                             . ' &= '
7249                             . ref($self)
7250                             . "'.  undef returned.");
7251                             return;
7252                         }
7253
7254                         return if $self->carp_if_locked;
7255                         $self->_set_range_list($self->_range_list & $other);
7256                         return $self;
7257                     },
7258         '-' => sub { my $self = shift;
7259                     my $other = shift;
7260                     my $reversed = shift;
7261                     if ($reversed) {
7262                         Carp::my_carp_bug("Bad news.  Can't cope with '"
7263                         . ref($other)
7264                         . ' - '
7265                         . ref($self)
7266                         . "'.  undef returned.");
7267                         return;
7268                     }
7269
7270                     return $self->_range_list - $other;
7271                 },
7272         '~' => sub { my $self = shift;
7273                     return ~ $self->_range_list;
7274                 },
7275     ;
7276
7277     sub _operator_stringify {
7278         my $self = shift;
7279
7280         my $name = $self->complete_name;
7281         return "Table '$name'";
7282     }
7283
7284     sub _range_list {
7285         # Returns the range list associated with this table, which will be the
7286         # complement's if it has one.
7287
7288         my $self = shift;
7289         my $complement;
7290         if (($complement = $self->complement) != 0) {
7291             return ~ $complement->_range_list;
7292         }
7293         else {
7294             return $self->SUPER::_range_list;
7295         }
7296     }
7297
7298     sub add_alias {
7299         # Add a synonym for this table.  See the comments in the base class
7300
7301         my $self = shift;
7302         my $name = shift;
7303         # Rest of parameters passed on.
7304
7305         $self->SUPER::add_alias($name, $self, @_);
7306         return;
7307     }
7308
7309     sub add_conflicting {
7310         # Add the name of some other object to the list of ones that name
7311         # clash with this match table.
7312
7313         my $self = shift;
7314         my $conflicting_name = shift;   # The name of the conflicting object
7315         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7316         my $conflicting_object = shift; # Optional, the conflicting object
7317                                         # itself.  This is used to
7318                                         # disambiguate the text if the input
7319                                         # name is identical to any of the
7320                                         # aliases $self is known by.
7321                                         # Sometimes the conflicting object is
7322                                         # merely hypothetical, so this has to
7323                                         # be an optional parameter.
7324         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7325
7326         my $addr = do { no overloading; pack 'J', $self; };
7327
7328         # Check if the conflicting name is exactly the same as any existing
7329         # alias in this table (as long as there is a real object there to
7330         # disambiguate with).
7331         if (defined $conflicting_object) {
7332             foreach my $alias ($self->aliases) {
7333                 if ($alias->name eq $conflicting_name) {
7334
7335                     # Here, there is an exact match.  This results in
7336                     # ambiguous comments, so disambiguate by changing the
7337                     # conflicting name to its object's complete equivalent.
7338                     $conflicting_name = $conflicting_object->complete_name;
7339                     last;
7340                 }
7341             }
7342         }
7343
7344         # Convert to the \p{...} final name
7345         $conflicting_name = "\\$p" . "{$conflicting_name}";
7346
7347         # Only add once
7348         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7349
7350         push @{$conflicting{$addr}}, $conflicting_name;
7351
7352         return;
7353     }
7354
7355     sub is_set_equivalent_to {
7356         # Return boolean of whether or not the other object is a table of this
7357         # type and has been marked equivalent to this one.
7358
7359         my $self = shift;
7360         my $other = shift;
7361         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7362
7363         return 0 if ! defined $other; # Can happen for incomplete early
7364                                       # releases
7365         unless ($other->isa(__PACKAGE__)) {
7366             my $ref_other = ref $other;
7367             my $ref_self = ref $self;
7368             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7369             return 0;
7370         }
7371
7372         # Two tables are equivalent if they have the same leader.
7373         no overloading;
7374         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7375         return;
7376     }
7377
7378     sub set_equivalent_to {
7379         # Set $self equivalent to the parameter table.
7380         # The required Related => 'x' parameter is a boolean indicating
7381         # whether these tables are related or not.  If related, $other becomes
7382         # the 'parent' of $self; if unrelated it becomes the 'leader'
7383         #
7384         # Related tables share all characteristics except names; equivalents
7385         # not quite so many.
7386         # If they are related, one must be a perl extension.  This is because
7387         # we can't guarantee that Unicode won't change one or the other in a
7388         # later release even if they are identical now.
7389
7390         my $self = shift;
7391         my $other = shift;
7392
7393         my %args = @_;
7394         my $related = delete $args{'Related'};
7395
7396         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7397
7398         return if ! defined $other;     # Keep on going; happens in some early
7399                                         # Unicode releases.
7400
7401         if (! defined $related) {
7402             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7403             $related = 0;
7404         }
7405
7406         # If already are equivalent, no need to re-do it;  if subroutine
7407         # returns null, it found an error, also do nothing
7408         my $are_equivalent = $self->is_set_equivalent_to($other);
7409         return if ! defined $are_equivalent || $are_equivalent;
7410
7411         my $addr = do { no overloading; pack 'J', $self; };
7412         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7413
7414         if ($related) {
7415             if ($current_leader->perl_extension) {
7416                 if ($other->perl_extension) {
7417                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7418                     return;
7419                 }
7420             } elsif ($self->property != $other->property    # Depending on
7421                                                             # situation, might
7422                                                             # be better to use
7423                                                             # add_alias()
7424                                                             # instead for same
7425                                                             # property
7426                      && ! $other->perl_extension)
7427             {
7428                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7429                 $related = 0;
7430             }
7431         }
7432
7433         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7434             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7435             return;
7436         }
7437
7438         my $leader = do { no overloading; pack 'J', $current_leader; };
7439         my $other_addr = do { no overloading; pack 'J', $other; };
7440
7441         # Any tables that are equivalent to or children of this table must now
7442         # instead be equivalent to or (children) to the new leader (parent),
7443         # still equivalent.  The equivalency includes their matches_all info,
7444         # and for related tables, their fate and status.
7445         # All related tables are of necessity equivalent, but the converse
7446         # isn't necessarily true
7447         my $status = $other->status;
7448         my $status_info = $other->status_info;
7449         my $fate = $other->fate;
7450         my $matches_all = $matches_all{other_addr};
7451         my $caseless_equivalent = $other->caseless_equivalent;
7452         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7453             next if $table == $other;
7454             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7455
7456             my $table_addr = do { no overloading; pack 'J', $table; };
7457             $leader{$table_addr} = $other;
7458             $matches_all{$table_addr} = $matches_all;
7459             $self->_set_range_list($other->_range_list);
7460             push @{$equivalents{$other_addr}}, $table;
7461             if ($related) {
7462                 $parent{$table_addr} = $other;
7463                 push @{$children{$other_addr}}, $table;
7464                 $table->set_status($status, $status_info);
7465
7466                 # This reason currently doesn't get exposed outside; otherwise
7467                 # would have to look up the parent's reason and use it instead.
7468                 $table->set_fate($fate, "Parent's fate");
7469
7470                 $self->set_caseless_equivalent($caseless_equivalent);
7471             }
7472         }
7473
7474         # Now that we've declared these to be equivalent, any changes to one
7475         # of the tables would invalidate that equivalency.
7476         $self->lock;
7477         $other->lock;
7478         return;
7479     }
7480
7481     sub set_complement {
7482         # Set $self to be the complement of the parameter table.  $self is
7483         # locked, as what it contains should all come from the other table.
7484
7485         my $self = shift;
7486         my $other = shift;
7487
7488         my %args = @_;
7489         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7490
7491         if ($other->complement != 0) {
7492             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7493             return;
7494         }
7495         my $addr = do { no overloading; pack 'J', $self; };
7496         $complement{$addr} = $other;
7497         $self->lock;
7498         return;
7499     }
7500
7501     sub add_range { # Add a range to the list for this table.
7502         my $self = shift;
7503         # Rest of parameters passed on
7504
7505         return if $self->carp_if_locked;
7506         return $self->_range_list->add_range(@_);
7507     }
7508
7509     sub header {
7510         my $self = shift;
7511         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7512
7513         # All match tables are to be used only by the Perl core.
7514         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7515     }
7516
7517     sub pre_body {  # Does nothing for match tables.
7518         return
7519     }
7520
7521     sub append_to_body {  # Does nothing for match tables.
7522         return
7523     }
7524
7525     sub set_fate {
7526         my $self = shift;
7527         my $fate = shift;
7528         my $reason = shift;
7529         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7530
7531         $self->SUPER::set_fate($fate, $reason);
7532
7533         # All children share this fate
7534         foreach my $child ($self->children) {
7535             $child->set_fate($fate, $reason);
7536         }
7537         return;
7538     }
7539
7540     sub write {
7541         my $self = shift;
7542         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7543
7544         return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7545     }
7546
7547     sub set_final_comment {
7548         # This creates a comment for the file that is to hold the match table
7549         # $self.  It is somewhat convoluted to make the English read nicely,
7550         # but, heh, it's just a comment.
7551         # This should be called only with the leader match table of all the
7552         # ones that share the same file.  It lists all such tables, ordered so
7553         # that related ones are together.
7554
7555         return unless $debugging_build;
7556
7557         my $leader = shift;   # Should only be called on the leader table of
7558                               # an equivalent group
7559         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7560
7561         my $addr = do { no overloading; pack 'J', $leader; };
7562
7563         if ($leader{$addr} != $leader) {
7564             Carp::my_carp_bug(<<END
7565 set_final_comment() must be called on a leader table, which $leader is not.
7566 It is equivalent to $leader{$addr}.  No comment created
7567 END
7568             );
7569             return;
7570         }
7571
7572         # Get the number of code points matched by each of the tables in this
7573         # file, and add underscores for clarity.
7574         my $count = $leader->count;
7575         my $string_count = main::clarify_number($count);
7576
7577         my $loose_count = 0;        # how many aliases loosely matched
7578         my $compound_name = "";     # ? Are any names compound?, and if so, an
7579                                     # example
7580         my $properties_with_compound_names = 0;    # count of these
7581
7582
7583         my %flags;              # The status flags used in the file
7584         my $total_entries = 0;  # number of entries written in the comment
7585         my $matches_comment = ""; # The portion of the comment about the
7586                                   # \p{}'s
7587         my @global_comments;    # List of all the tables' comments that are
7588                                 # there before this routine was called.
7589         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
7590                                 # Unicode::UCD.  If not, then don't say it is
7591                                 # in the comment
7592
7593         # Get list of all the parent tables that are equivalent to this one
7594         # (including itself).
7595         my @parents = grep { $parent{main::objaddr $_} == $_ }
7596                             main::uniques($leader, @{$equivalents{$addr}});
7597         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
7598                                               # tables
7599
7600         for my $parent (@parents) {
7601
7602             my $property = $parent->property;
7603
7604             # Special case 'N' tables in properties with two match tables when
7605             # the other is a 'Y' one.  These are likely to be binary tables,
7606             # but not necessarily.  In either case, \P{} will match the
7607             # complement of \p{}, and so if something is a synonym of \p, the
7608             # complement of that something will be the synonym of \P.  This
7609             # would be true of any property with just two match tables, not
7610             # just those whose values are Y and N; but that would require a
7611             # little extra work, and there are none such so far in Unicode.
7612             my $perl_p = 'p';        # which is it?  \p{} or \P{}
7613             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
7614
7615             if (scalar $property->tables == 2
7616                 && $parent == $property->table('N')
7617                 && defined (my $yes = $property->table('Y')))
7618             {
7619                 my $yes_addr = do { no overloading; pack 'J', $yes; };
7620                 @yes_perl_synonyms
7621                     = grep { $_->property == $perl }
7622                                     main::uniques($yes,
7623                                                 $parent{$yes_addr},
7624                                                 $parent{$yes_addr}->children);
7625
7626                 # But these synonyms are \P{} ,not \p{}
7627                 $perl_p = 'P';
7628             }
7629
7630             my @description;        # Will hold the table description
7631             my @note;               # Will hold the table notes.
7632             my @conflicting;        # Will hold the table conflicts.
7633
7634             # Look at the parent, any yes synonyms, and all the children
7635             my $parent_addr = do { no overloading; pack 'J', $parent; };
7636             for my $table ($parent,
7637                            @yes_perl_synonyms,
7638                            @{$children{$parent_addr}})
7639             {
7640                 my $table_addr = do { no overloading; pack 'J', $table; };
7641                 my $table_property = $table->property;
7642
7643                 # Tables are separated by a blank line to create a grouping.
7644                 $matches_comment .= "\n" if $matches_comment;
7645
7646                 # The table is named based on the property and value
7647                 # combination it is for, like script=greek.  But there may be
7648                 # a number of synonyms for each side, like 'sc' for 'script',
7649                 # and 'grek' for 'greek'.  Any combination of these is a valid
7650                 # name for this table.  In this case, there are three more,
7651                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
7652                 # listing all possible combinations in the comment, we make
7653                 # sure that each synonym occurs at least once, and add
7654                 # commentary that the other combinations are possible.
7655                 # Because regular expressions don't recognize things like
7656                 # \p{jsn=}, only look at non-null right-hand-sides
7657                 my @property_aliases = $table_property->aliases;
7658                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7659
7660                 # The alias lists above are already ordered in the order we
7661                 # want to output them.  To ensure that each synonym is listed,
7662                 # we must use the max of the two numbers.  But if there are no
7663                 # legal synonyms (nothing in @table_aliases), then we don't
7664                 # list anything.
7665                 my $listed_combos = (@table_aliases)
7666                                     ?  main::max(scalar @table_aliases,
7667                                                  scalar @property_aliases)
7668                                     : 0;
7669                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7670
7671
7672                 my $property_had_compound_name = 0;
7673
7674                 for my $i (0 .. $listed_combos - 1) {
7675                     $total_entries++;
7676
7677                     # The current alias for the property is the next one on
7678                     # the list, or if beyond the end, start over.  Similarly
7679                     # for the table (\p{prop=table})
7680                     my $property_alias = $property_aliases
7681                                             [$i % @property_aliases]->name;
7682                     my $table_alias_object = $table_aliases
7683                                                         [$i % @table_aliases];
7684                     my $table_alias = $table_alias_object->name;
7685                     my $loose_match = $table_alias_object->loose_match;
7686                     $has_ucd_alias |= $table_alias_object->ucd;
7687
7688                     if ($table_alias !~ /\D/) { # Clarify large numbers.
7689                         $table_alias = main::clarify_number($table_alias)
7690                     }
7691
7692                     # Add a comment for this alias combination
7693                     my $current_match_comment;
7694                     if ($table_property == $perl) {
7695                         $current_match_comment = "\\$perl_p"
7696                                                     . "{$table_alias}";
7697                     }
7698                     else {
7699                         $current_match_comment
7700                                         = "\\p{$property_alias=$table_alias}";
7701                         $property_had_compound_name = 1;
7702                     }
7703
7704                     # Flag any abnormal status for this table.
7705                     my $flag = $property->status
7706                                 || $table->status
7707                                 || $table_alias_object->status;
7708                     if ($flag && $flag ne $PLACEHOLDER) {
7709                         $flags{$flag} = $status_past_participles{$flag};
7710                     }
7711
7712                     $loose_count++;
7713
7714                     # Pretty up the comment.  Note the \b; it says don't make
7715                     # this line a continuation.
7716                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7717                                         $flag,
7718                                         " " x 7,
7719                                         $current_match_comment);
7720                 } # End of generating the entries for this table.
7721
7722                 # Save these for output after this group of related tables.
7723                 push @description, $table->description;
7724                 push @note, $table->note;
7725                 push @conflicting, $table->conflicting;
7726
7727                 # And this for output after all the tables.
7728                 push @global_comments, $table->comment;
7729
7730                 # Compute an alternate compound name using the final property
7731                 # synonym and the first table synonym with a colon instead of
7732                 # the equal sign used elsewhere.
7733                 if ($property_had_compound_name) {
7734                     $properties_with_compound_names ++;
7735                     if (! $compound_name || @property_aliases > 1) {
7736                         $compound_name = $property_aliases[-1]->name
7737                                         . ': '
7738                                         . $table_aliases[0]->name;
7739                     }
7740                 }
7741             } # End of looping through all children of this table
7742
7743             # Here have assembled in $matches_comment all the related tables
7744             # to the current parent (preceded by the same info for all the
7745             # previous parents).  Put out information that applies to all of
7746             # the current family.
7747             if (@conflicting) {
7748
7749                 # But output the conflicting information now, as it applies to
7750                 # just this table.
7751                 my $conflicting = join ", ", @conflicting;
7752                 if ($conflicting) {
7753                     $matches_comment .= <<END;
7754
7755     Note that contrary to what you might expect, the above is NOT the same as
7756 END
7757                     $matches_comment .= "any of: " if @conflicting > 1;
7758                     $matches_comment .= "$conflicting\n";
7759                 }
7760             }
7761             if (@description) {
7762                 $matches_comment .= "\n    Meaning: "
7763                                     . join('; ', @description)
7764                                     . "\n";
7765             }
7766             if (@note) {
7767                 $matches_comment .= "\n    Note: "
7768                                     . join("\n    ", @note)
7769                                     . "\n";
7770             }
7771         } # End of looping through all tables
7772
7773
7774         my $code_points;
7775         my $match;
7776         my $any_of_these;
7777         if ($count == 1) {
7778             $match = 'matches';
7779             $code_points = 'single code point';
7780         }
7781         else {
7782             $match = 'match';
7783             $code_points = "$string_count code points";
7784         }
7785
7786         my $synonyms;
7787         my $entries;
7788         if ($total_entries == 1) {
7789             $synonyms = "";
7790             $entries = 'entry';
7791             $any_of_these = 'this'
7792         }
7793         else {
7794             $synonyms = " any of the following regular expression constructs";
7795             $entries = 'entries';
7796             $any_of_these = 'any of these'
7797         }
7798
7799         my $comment = "";
7800         if ($has_ucd_alias) {
7801             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7802         }
7803         if ($has_unrelated) {
7804             $comment .= <<END;
7805 This file is for tables that are not necessarily related:  To conserve
7806 resources, every table that matches the identical set of code points in this
7807 version of Unicode uses this file.  Each one is listed in a separate group
7808 below.  It could be that the tables will match the same set of code points in
7809 other Unicode releases, or it could be purely coincidence that they happen to
7810 be the same in Unicode $string_version, and hence may not in other versions.
7811
7812 END
7813         }
7814
7815         if (%flags) {
7816             foreach my $flag (sort keys %flags) {
7817                 $comment .= <<END;
7818 '$flag' below means that this form is $flags{$flag}.
7819 Consult $pod_file.pod
7820 END
7821             }
7822             $comment .= "\n";
7823         }
7824
7825         if ($total_entries == 0) {
7826             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
7827             $comment .= <<END;
7828 This file returns the $code_points in Unicode Version $string_version for
7829 $leader, but it is inaccessible through Perl regular expressions, as
7830 "\\p{prop=}" is not recognized.
7831 END
7832
7833         } else {
7834             $comment .= <<END;
7835 This file returns the $code_points in Unicode Version $string_version that
7836 $match$synonyms:
7837
7838 $matches_comment
7839 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7840 including if adding or subtracting white space, underscore, and hyphen
7841 characters matters or doesn't matter, and other permissible syntactic
7842 variants.  Upper/lower case distinctions never matter.
7843 END
7844
7845         }
7846         if ($compound_name) {
7847             $comment .= <<END;
7848
7849 A colon can be substituted for the equals sign, and
7850 END
7851             if ($properties_with_compound_names > 1) {
7852                 $comment .= <<END;
7853 within each group above,
7854 END
7855             }
7856             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7857
7858             # Note the \b below, it says don't make that line a continuation.
7859             $comment .= <<END;
7860 anything to the left of the equals (or colon) can be combined with anything to
7861 the right.  Thus, for example,
7862 $compound_name
7863 \bis also valid.
7864 END
7865         }
7866
7867         # And append any comment(s) from the actual tables.  They are all
7868         # gathered here, so may not read all that well.
7869         if (@global_comments) {
7870             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7871         }
7872
7873         if ($count) {   # The format differs if no code points, and needs no
7874                         # explanation in that case
7875                 $comment.= <<END;
7876
7877 The format of the lines of this file is:
7878 END
7879             $comment.= <<END;
7880 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7881 STOP is the ending point, or if omitted, the range has just one code point.
7882 END
7883             if ($leader->output_range_counts) {
7884                 $comment .= <<END;
7885 Numbers in comments in [brackets] indicate how many code points are in the
7886 range.
7887 END
7888             }
7889         }
7890
7891         $leader->set_comment(main::join_lines($comment));
7892         return;
7893     }
7894
7895     # Accessors for the underlying list
7896     for my $sub (qw(
7897                     get_valid_code_point
7898                     get_invalid_code_point
7899                 ))
7900     {
7901         no strict "refs";
7902         *$sub = sub {
7903             use strict "refs";
7904             my $self = shift;
7905
7906             return $self->_range_list->$sub(@_);
7907         }
7908     }
7909 } # End closure for Match_Table
7910
7911 package Property;
7912
7913 # The Property class represents a Unicode property, or the $perl
7914 # pseudo-property.  It contains a map table initialized empty at construction
7915 # time, and for properties accessible through regular expressions, various
7916 # match tables, created through the add_match_table() method, and referenced
7917 # by the table('NAME') or tables() methods, the latter returning a list of all
7918 # of the match tables.  Otherwise table operations implicitly are for the map
7919 # table.
7920 #
7921 # Most of the data in the property is actually about its map table, so it
7922 # mostly just uses that table's accessors for most methods.  The two could
7923 # have been combined into one object, but for clarity because of their
7924 # differing semantics, they have been kept separate.  It could be argued that
7925 # the 'file' and 'directory' fields should be kept with the map table.
7926 #
7927 # Each property has a type.  This can be set in the constructor, or in the
7928 # set_type accessor, but mostly it is figured out by the data.  Every property
7929 # starts with unknown type, overridden by a parameter to the constructor, or
7930 # as match tables are added, or ranges added to the map table, the data is
7931 # inspected, and the type changed.  After the table is mostly or entirely
7932 # filled, compute_type() should be called to finalize they analysis.
7933 #
7934 # There are very few operations defined.  One can safely remove a range from
7935 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7936 # table to this one, replacing any in the intersection of the two.
7937
7938 sub standardize { return main::standardize($_[0]); }
7939 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7940
7941 {   # Closure
7942
7943     # This hash will contain as keys, all the aliases of all properties, and
7944     # as values, pointers to their respective property objects.  This allows
7945     # quick look-up of a property from any of its names.
7946     my %alias_to_property_of;
7947
7948     sub dump_alias_to_property_of {
7949         # For debugging
7950
7951         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7952         return;
7953     }
7954
7955     sub property_ref {
7956         # This is a package subroutine, not called as a method.
7957         # If the single parameter is a literal '*' it returns a list of all
7958         # defined properties.
7959         # Otherwise, the single parameter is a name, and it returns a pointer
7960         # to the corresponding property object, or undef if none.
7961         #
7962         # Properties can have several different names.  The 'standard' form of
7963         # each of them is stored in %alias_to_property_of as they are defined.
7964         # But it's possible that this subroutine will be called with some
7965         # variant, so if the initial lookup fails, it is repeated with the
7966         # standardized form of the input name.  If found, besides returning the
7967         # result, the input name is added to the list so future calls won't
7968         # have to do the conversion again.
7969
7970         my $name = shift;
7971
7972         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7973
7974         if (! defined $name) {
7975             Carp::my_carp_bug("Undefined input property.  No action taken.");
7976             return;
7977         }
7978
7979         return main::uniques(values %alias_to_property_of) if $name eq '*';
7980
7981         # Return cached result if have it.
7982         my $result = $alias_to_property_of{$name};
7983         return $result if defined $result;
7984
7985         # Convert the input to standard form.
7986         my $standard_name = standardize($name);
7987
7988         $result = $alias_to_property_of{$standard_name};
7989         return unless defined $result;        # Don't cache undefs
7990
7991         # Cache the result before returning it.
7992         $alias_to_property_of{$name} = $result;
7993         return $result;
7994     }
7995
7996
7997     main::setup_package();
7998
7999     my %map;
8000     # A pointer to the map table object for this property
8001     main::set_access('map', \%map);
8002
8003     my %full_name;
8004     # The property's full name.  This is a duplicate of the copy kept in the
8005     # map table, but is needed because stringify needs it during
8006     # construction of the map table, and then would have a chicken before egg
8007     # problem.
8008     main::set_access('full_name', \%full_name, 'r');
8009
8010     my %table_ref;
8011     # This hash will contain as keys, all the aliases of any match tables
8012     # attached to this property, and as values, the pointers to their
8013     # respective tables.  This allows quick look-up of a table from any of its
8014     # names.
8015     main::set_access('table_ref', \%table_ref);
8016
8017     my %type;
8018     # The type of the property, $ENUM, $BINARY, etc
8019     main::set_access('type', \%type, 'r');
8020
8021     my %file;
8022     # The filename where the map table will go (if actually written).
8023     # Normally defaulted, but can be overridden.
8024     main::set_access('file', \%file, 'r', 's');
8025
8026     my %directory;
8027     # The directory where the map table will go (if actually written).
8028     # Normally defaulted, but can be overridden.
8029     main::set_access('directory', \%directory, 's');
8030
8031     my %pseudo_map_type;
8032     # This is used to affect the calculation of the map types for all the
8033     # ranges in the table.  It should be set to one of the values that signify
8034     # to alter the calculation.
8035     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8036
8037     my %has_only_code_point_maps;
8038     # A boolean used to help in computing the type of data in the map table.
8039     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8040
8041     my %unique_maps;
8042     # A list of the first few distinct mappings this property has.  This is
8043     # used to disambiguate between binary and enum property types, so don't
8044     # have to keep more than three.
8045     main::set_access('unique_maps', \%unique_maps);
8046
8047     my %pre_declared_maps;
8048     # A boolean that gives whether the input data should declare all the
8049     # tables used, or not.  If the former, unknown ones raise a warning.
8050     main::set_access('pre_declared_maps',
8051                                     \%pre_declared_maps, 'r', 's');
8052
8053     sub new {
8054         # The only required parameter is the positionally first, name.  All
8055         # other parameters are key => value pairs.  See the documentation just
8056         # above for the meanings of the ones not passed directly on to the map
8057         # table constructor.
8058
8059         my $class = shift;
8060         my $name = shift || "";
8061
8062         my $self = property_ref($name);
8063         if (defined $self) {
8064             my $options_string = join ", ", @_;
8065             $options_string = ".  Ignoring options $options_string" if $options_string;
8066             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
8067             return $self;
8068         }
8069
8070         my %args = @_;
8071
8072         $self = bless \do { my $anonymous_scalar }, $class;
8073         my $addr = do { no overloading; pack 'J', $self; };
8074
8075         $directory{$addr} = delete $args{'Directory'};
8076         $file{$addr} = delete $args{'File'};
8077         $full_name{$addr} = delete $args{'Full_Name'} || $name;
8078         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8079         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8080         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8081                                     # Starting in this release, property
8082                                     # values should be defined for all
8083                                     # properties, except those overriding this
8084                                     // $v_version ge v5.1.0;
8085
8086         # Rest of parameters passed on.
8087
8088         $has_only_code_point_maps{$addr} = 1;
8089         $table_ref{$addr} = { };
8090         $unique_maps{$addr} = { };
8091
8092         $map{$addr} = Map_Table->new($name,
8093                                     Full_Name => $full_name{$addr},
8094                                     _Alias_Hash => \%alias_to_property_of,
8095                                     _Property => $self,
8096                                     %args);
8097         return $self;
8098     }
8099
8100     # See this program's beginning comment block about overloading the copy
8101     # constructor.  Few operations are defined on properties, but a couple are
8102     # useful.  It is safe to take the inverse of a property, and to remove a
8103     # single code point from it.
8104     use overload
8105         fallback => 0,
8106         qw("") => "_operator_stringify",
8107         "." => \&main::_operator_dot,
8108         ".=" => \&main::_operator_dot_equal,
8109         '==' => \&main::_operator_equal,
8110         '!=' => \&main::_operator_not_equal,
8111         '=' => sub { return shift },
8112         '-=' => "_minus_and_equal",
8113     ;
8114
8115     sub _operator_stringify {
8116         return "Property '" .  shift->full_name . "'";
8117     }
8118
8119     sub _minus_and_equal {
8120         # Remove a single code point from the map table of a property.
8121
8122         my $self = shift;
8123         my $other = shift;
8124         my $reversed = shift;
8125         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8126
8127         if (ref $other) {
8128             Carp::my_carp_bug("Bad news.  Can't cope with a "
8129                         . ref($other)
8130                         . " argument to '-='.  Subtraction ignored.");
8131             return $self;
8132         }
8133         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
8134             Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
8135             . ref $self
8136             . " from a non-object.  undef returned.");
8137             return;
8138         }
8139         else {
8140             no overloading;
8141             $map{pack 'J', $self}->delete_range($other, $other);
8142         }
8143         return $self;
8144     }
8145
8146     sub add_match_table {
8147         # Add a new match table for this property, with name given by the
8148         # parameter.  It returns a pointer to the table.
8149
8150         my $self = shift;
8151         my $name = shift;
8152         my %args = @_;
8153
8154         my $addr = do { no overloading; pack 'J', $self; };
8155
8156         my $table = $table_ref{$addr}{$name};
8157         my $standard_name = main::standardize($name);
8158         if (defined $table
8159             || (defined ($table = $table_ref{$addr}{$standard_name})))
8160         {
8161             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
8162             $table_ref{$addr}{$name} = $table;
8163             return $table;
8164         }
8165         else {
8166
8167             # See if this is a perl extension, if not passed in.
8168             my $perl_extension = delete $args{'Perl_Extension'};
8169             $perl_extension
8170                         = $self->perl_extension if ! defined $perl_extension;
8171
8172             $table = Match_Table->new(
8173                                 Name => $name,
8174                                 Perl_Extension => $perl_extension,
8175                                 _Alias_Hash => $table_ref{$addr},
8176                                 _Property => $self,
8177
8178                                 # gets property's fate and status by default,
8179                                 # except if the name begind with an
8180                                 # underscore, default it to internal
8181                                 Fate => ($name =~ /^_/)
8182                                          ? $INTERNAL_ONLY
8183                                          : $self->fate,
8184                                 Status => $self->status,
8185                                 _Status_Info => $self->status_info,
8186                                 %args);
8187             return unless defined $table;
8188         }
8189
8190         # Save the names for quick look up
8191         $table_ref{$addr}{$standard_name} = $table;
8192         $table_ref{$addr}{$name} = $table;
8193
8194         # Perhaps we can figure out the type of this property based on the
8195         # fact of adding this match table.  First, string properties don't
8196         # have match tables; second, a binary property can't have 3 match
8197         # tables
8198         if ($type{$addr} == $UNKNOWN) {
8199             $type{$addr} = $NON_STRING;
8200         }
8201         elsif ($type{$addr} == $STRING) {
8202             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
8203             $type{$addr} = $NON_STRING;
8204         }
8205         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8206             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8207                 if ($type{$addr} == $BINARY) {
8208                     Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
8209                 }
8210                 $type{$addr} = $ENUM;
8211             }
8212         }
8213
8214         return $table;
8215     }
8216
8217     sub delete_match_table {
8218         # Delete the table referred to by $2 from the property $1.
8219
8220         my $self = shift;
8221         my $table_to_remove = shift;
8222         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8223
8224         my $addr = do { no overloading; pack 'J', $self; };
8225
8226         # Remove all names that refer to it.
8227         foreach my $key (keys %{$table_ref{$addr}}) {
8228             delete $table_ref{$addr}{$key}
8229                                 if $table_ref{$addr}{$key} == $table_to_remove;
8230         }
8231
8232         $table_to_remove->DESTROY;
8233         return;
8234     }
8235
8236     sub table {
8237         # Return a pointer to the match table (with name given by the
8238         # parameter) associated with this property; undef if none.
8239
8240         my $self = shift;
8241         my $name = shift;
8242         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8243
8244         my $addr = do { no overloading; pack 'J', $self; };
8245
8246         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
8247
8248         # If quick look-up failed, try again using the standard form of the
8249         # input name.  If that succeeds, cache the result before returning so
8250         # won't have to standardize this input name again.
8251         my $standard_name = main::standardize($name);
8252         return unless defined $table_ref{$addr}{$standard_name};
8253
8254         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
8255         return $table_ref{$addr}{$name};
8256     }
8257
8258     sub tables {
8259         # Return a list of pointers to all the match tables attached to this
8260         # property
8261
8262         no overloading;
8263         return main::uniques(values %{$table_ref{pack 'J', shift}});
8264     }
8265
8266     sub directory {
8267         # Returns the directory the map table for this property should be
8268         # output in.  If a specific directory has been specified, that has
8269         # priority;  'undef' is returned if the type isn't defined;
8270         # or $map_directory for everything else.
8271
8272         my $addr = do { no overloading; pack 'J', shift; };
8273
8274         return $directory{$addr} if defined $directory{$addr};
8275         return undef if $type{$addr} == $UNKNOWN;
8276         return $map_directory;
8277     }
8278
8279     sub swash_name {
8280         # Return the name that is used to both:
8281         #   1)  Name the file that the map table is written to.
8282         #   2)  The name of swash related stuff inside that file.
8283         # The reason for this is that the Perl core historically has used
8284         # certain names that aren't the same as the Unicode property names.
8285         # To continue using these, $file is hard-coded in this file for those,
8286         # but otherwise the standard name is used.  This is different from the
8287         # external_name, so that the rest of the files, like in lib can use
8288         # the standard name always, without regard to historical precedent.
8289
8290         my $self = shift;
8291         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8292
8293         my $addr = do { no overloading; pack 'J', $self; };
8294
8295         # Swash names are used only on either
8296         # 1) legacy-only properties, because the formats for these are
8297         #    unchangeable, and they have had these lines in them; or
8298         # 2) regular map tables; otherwise there should be no access to the
8299         #    property map table from other parts of Perl.
8300         return if $map{$addr}->fate != $ORDINARY
8301                   && $map{$addr}->fate != $LEGACY_ONLY;
8302
8303         return $file{$addr} if defined $file{$addr};
8304         return $map{$addr}->external_name;
8305     }
8306
8307     sub to_create_match_tables {
8308         # Returns a boolean as to whether or not match tables should be
8309         # created for this property.
8310
8311         my $self = shift;
8312         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8313
8314         # The whole point of this pseudo property is match tables.
8315         return 1 if $self == $perl;
8316
8317         my $addr = do { no overloading; pack 'J', $self; };
8318
8319         # Don't generate tables of code points that match the property values
8320         # of a string property.  Such a list would most likely have many
8321         # property values, each with just one or very few code points mapping
8322         # to it.
8323         return 0 if $type{$addr} == $STRING;
8324
8325         # Don't generate anything for unimplemented properties.
8326         return 0 if grep { $self->complete_name eq $_ }
8327                                                     @unimplemented_properties;
8328         # Otherwise, do.
8329         return 1;
8330     }
8331
8332     sub property_add_or_replace_non_nulls {
8333         # This adds the mappings in the property $other to $self.  Non-null
8334         # mappings from $other override those in $self.  It essentially merges
8335         # the two properties, with the second having priority except for null
8336         # mappings.
8337
8338         my $self = shift;
8339         my $other = shift;
8340         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8341
8342         if (! $other->isa(__PACKAGE__)) {
8343             Carp::my_carp_bug("$other should be a "
8344                             . __PACKAGE__
8345                             . ".  Not a '"
8346                             . ref($other)
8347                             . "'.  Not added;");
8348             return;
8349         }
8350
8351         no overloading;
8352         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8353     }
8354
8355     sub set_proxy_for {
8356         # Certain tables are not generally written out to files, but
8357         # Unicode::UCD has the intelligence to know that the file for $self
8358         # can be used to reconstruct those tables.  This routine just changes
8359         # things so that UCD pod entries for those suppressed tables are
8360         # generated, so the fact that a proxy is used is invisible to the
8361         # user.
8362
8363         my $self = shift;
8364
8365         foreach my $property_name (@_) {
8366             my $ref = property_ref($property_name);
8367             next if $ref->to_output_map;
8368             $ref->set_fate($MAP_PROXIED);
8369         }
8370     }
8371
8372     sub set_type {
8373         # Set the type of the property.  Mostly this is figured out by the
8374         # data in the table.  But this is used to set it explicitly.  The
8375         # reason it is not a standard accessor is that when setting a binary
8376         # property, we need to make sure that all the true/false aliases are
8377         # present, as they were omitted in early Unicode releases.
8378
8379         my $self = shift;
8380         my $type = shift;
8381         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8382
8383         if ($type != $ENUM
8384             && $type != $BINARY
8385             && $type != $FORCED_BINARY
8386             && $type != $STRING)
8387         {
8388             Carp::my_carp("Unrecognized type '$type'.  Type not set");
8389             return;
8390         }
8391
8392         { no overloading; $type{pack 'J', $self} = $type; }
8393         return if $type != $BINARY && $type != $FORCED_BINARY;
8394
8395         my $yes = $self->table('Y');
8396         $yes = $self->table('Yes') if ! defined $yes;
8397         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8398                                                             if ! defined $yes;
8399
8400         # Add aliases in order wanted, duplicates will be ignored.  We use a
8401         # binary property present in all releases for its ordered lists of
8402         # true/false aliases.  Note, that could run into problems in
8403         # outputting things in that we don't distinguish between the name and
8404         # full name of these.  Hopefully, if the table was already created
8405         # before this code is executed, it was done with these set properly.
8406         my $bm = property_ref("Bidi_Mirrored");
8407         foreach my $alias ($bm->table("Y")->aliases) {
8408             $yes->add_alias($alias->name);
8409         }
8410         my $no = $self->table('N');
8411         $no = $self->table('No') if ! defined $no;
8412         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8413         foreach my $alias ($bm->table("N")->aliases) {
8414             $no->add_alias($alias->name);
8415         }
8416
8417         return;
8418     }
8419
8420     sub add_map {
8421         # Add a map to the property's map table.  This also keeps
8422         # track of the maps so that the property type can be determined from
8423         # its data.
8424
8425         my $self = shift;
8426         my $start = shift;  # First code point in range
8427         my $end = shift;    # Final code point in range
8428         my $map = shift;    # What the range maps to.
8429         # Rest of parameters passed on.
8430
8431         my $addr = do { no overloading; pack 'J', $self; };
8432
8433         # If haven't the type of the property, gather information to figure it
8434         # out.
8435         if ($type{$addr} == $UNKNOWN) {
8436
8437             # If the map contains an interior blank or dash, or most other
8438             # nonword characters, it will be a string property.  This
8439             # heuristic may actually miss some string properties.  If so, they
8440             # may need to have explicit set_types called for them.  This
8441             # happens in the Unihan properties.
8442             if ($map =~ / (?<= . ) [ -] (?= . ) /x
8443                 || $map =~ / [^\w.\/\ -]  /x)
8444             {
8445                 $self->set_type($STRING);
8446
8447                 # $unique_maps is used for disambiguating between ENUM and
8448                 # BINARY later; since we know the property is not going to be
8449                 # one of those, no point in keeping the data around
8450                 undef $unique_maps{$addr};
8451             }
8452             else {
8453
8454                 # Not necessarily a string.  The final decision has to be
8455                 # deferred until all the data are in.  We keep track of if all
8456                 # the values are code points for that eventual decision.
8457                 $has_only_code_point_maps{$addr} &=
8458                                             $map =~ / ^ $code_point_re $/x;
8459
8460                 # For the purposes of disambiguating between binary and other
8461                 # enumerations at the end, we keep track of the first three
8462                 # distinct property values.  Once we get to three, we know
8463                 # it's not going to be binary, so no need to track more.
8464                 if (scalar keys %{$unique_maps{$addr}} < 3) {
8465                     $unique_maps{$addr}{main::standardize($map)} = 1;
8466                 }
8467             }
8468         }
8469
8470         # Add the mapping by calling our map table's method
8471         return $map{$addr}->add_map($start, $end, $map, @_);
8472     }
8473
8474     sub compute_type {
8475         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
8476         # should be called after the property is mostly filled with its maps.
8477         # We have been keeping track of what the property values have been,
8478         # and now have the necessary information to figure out the type.
8479
8480         my $self = shift;
8481         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8482
8483         my $addr = do { no overloading; pack 'J', $self; };
8484
8485         my $type = $type{$addr};
8486
8487         # If already have figured these out, no need to do so again, but we do
8488         # a double check on ENUMS to make sure that a string property hasn't
8489         # improperly been classified as an ENUM, so continue on with those.
8490         return if $type == $STRING
8491                   || $type == $BINARY
8492                   || $type == $FORCED_BINARY;
8493
8494         # If every map is to a code point, is a string property.
8495         if ($type == $UNKNOWN
8496             && ($has_only_code_point_maps{$addr}
8497                 || (defined $map{$addr}->default_map
8498                     && $map{$addr}->default_map eq "")))
8499         {
8500             $self->set_type($STRING);
8501         }
8502         else {
8503
8504             # Otherwise, it is to some sort of enumeration.  (The case where
8505             # it is a Unicode miscellaneous property, and treated like a
8506             # string in this program is handled in add_map()).  Distinguish
8507             # between binary and some other enumeration type.  Of course, if
8508             # there are more than two values, it's not binary.  But more
8509             # subtle is the test that the default mapping is defined means it
8510             # isn't binary.  This in fact may change in the future if Unicode
8511             # changes the way its data is structured.  But so far, no binary
8512             # properties ever have @missing lines for them, so the default map
8513             # isn't defined for them.  The few properties that are two-valued
8514             # and aren't considered binary have the default map defined
8515             # starting in Unicode 5.0, when the @missing lines appeared; and
8516             # this program has special code to put in a default map for them
8517             # for earlier than 5.0 releases.
8518             if ($type == $ENUM
8519                 || scalar keys %{$unique_maps{$addr}} > 2
8520                 || defined $self->default_map)
8521             {
8522                 my $tables = $self->tables;
8523                 my $count = $self->count;
8524                 if ($verbosity && $tables > 500 && $tables/$count > .1) {
8525                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
8526                 }
8527                 $self->set_type($ENUM);
8528             }
8529             else {
8530                 $self->set_type($BINARY);
8531             }
8532         }
8533         undef $unique_maps{$addr};  # Garbage collect
8534         return;
8535     }
8536
8537     sub set_fate {
8538         my $self = shift;
8539         my $fate = shift;
8540         my $reason = shift;  # Ignored unless suppressing
8541         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8542
8543         my $addr = do { no overloading; pack 'J', $self; };
8544         if ($fate == $SUPPRESSED) {
8545             $why_suppressed{$self->complete_name} = $reason;
8546         }
8547
8548         # Each table shares the property's fate, except that MAP_PROXIED
8549         # doesn't affect match tables
8550         $map{$addr}->set_fate($fate, $reason);
8551         if ($fate != $MAP_PROXIED) {
8552             foreach my $table ($map{$addr}, $self->tables) {
8553                 $table->set_fate($fate, $reason);
8554             }
8555         }
8556         return;
8557     }
8558
8559
8560     # Most of the accessors for a property actually apply to its map table.
8561     # Setup up accessor functions for those, referring to %map
8562     for my $sub (qw(
8563                     add_alias
8564                     add_anomalous_entry
8565                     add_comment
8566                     add_conflicting
8567                     add_description
8568                     add_duplicate
8569                     add_note
8570                     aliases
8571                     comment
8572                     complete_name
8573                     containing_range
8574                     count
8575                     default_map
8576                     delete_range
8577                     description
8578                     each_range
8579                     external_name
8580                     fate
8581                     file_path
8582                     format
8583                     initialize
8584                     inverse_list
8585                     is_empty
8586                     replacement_property
8587                     name
8588                     note
8589                     perl_extension
8590                     property
8591                     range_count
8592                     ranges
8593                     range_size_1
8594                     reset_each_range
8595                     set_comment
8596                     set_default_map
8597                     set_file_path
8598                     set_final_comment
8599                     _set_format
8600                     set_range_size_1
8601                     set_status
8602                     set_to_output_map
8603                     short_name
8604                     status
8605                     status_info
8606                     to_output_map
8607                     type_of
8608                     value_of
8609                     write
8610                 ))
8611                     # 'property' above is for symmetry, so that one can take
8612                     # the property of a property and get itself, and so don't
8613                     # have to distinguish between properties and tables in
8614                     # calling code
8615     {
8616         no strict "refs";
8617         *$sub = sub {
8618             use strict "refs";
8619             my $self = shift;
8620             no overloading;
8621             return $map{pack 'J', $self}->$sub(@_);
8622         }
8623     }
8624
8625
8626 } # End closure
8627
8628 package main;
8629
8630 sub join_lines($) {
8631     # Returns lines of the input joined together, so that they can be folded
8632     # properly.
8633     # This causes continuation lines to be joined together into one long line
8634     # for folding.  A continuation line is any line that doesn't begin with a
8635     # space or "\b" (the latter is stripped from the output).  This is so
8636     # lines can be be in a HERE document so as to fit nicely in the terminal
8637     # width, but be joined together in one long line, and then folded with
8638     # indents, '#' prefixes, etc, properly handled.
8639     # A blank separates the joined lines except if there is a break; an extra
8640     # blank is inserted after a period ending a line.
8641
8642     # Initialize the return with the first line.
8643     my ($return, @lines) = split "\n", shift;
8644
8645     # If the first line is null, it was an empty line, add the \n back in
8646     $return = "\n" if $return eq "";
8647
8648     # Now join the remainder of the physical lines.
8649     for my $line (@lines) {
8650
8651         # An empty line means wanted a blank line, so add two \n's to get that
8652         # effect, and go to the next line.
8653         if (length $line == 0) {
8654             $return .= "\n\n";
8655             next;
8656         }
8657
8658         # Look at the last character of what we have so far.
8659         my $previous_char = substr($return, -1, 1);
8660
8661         # And at the next char to be output.
8662         my $next_char = substr($line, 0, 1);
8663
8664         if ($previous_char ne "\n") {
8665
8666             # Here didn't end wth a nl.  If the next char a blank or \b, it
8667             # means that here there is a break anyway.  So add a nl to the
8668             # output.
8669             if ($next_char eq " " || $next_char eq "\b") {
8670                 $previous_char = "\n";
8671                 $return .= $previous_char;
8672             }
8673
8674             # Add an extra space after periods.
8675             $return .= " " if $previous_char eq '.';
8676         }
8677
8678         # Here $previous_char is still the latest character to be output.  If
8679         # it isn't a nl, it means that the next line is to be a continuation
8680         # line, with a blank inserted between them.
8681         $return .= " " if $previous_char ne "\n";
8682
8683         # Get rid of any \b
8684         substr($line, 0, 1) = "" if $next_char eq "\b";
8685
8686         # And append this next line.
8687         $return .= $line;
8688     }
8689
8690     return $return;
8691 }
8692
8693 sub simple_fold($;$$$) {
8694     # Returns a string of the input (string or an array of strings) folded
8695     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8696     # a \n
8697     # This is tailored for the kind of text written by this program,
8698     # especially the pod file, which can have very long names with
8699     # underscores in the middle, or words like AbcDefgHij....  We allow
8700     # breaking in the middle of such constructs if the line won't fit
8701     # otherwise.  The break in such cases will come either just after an
8702     # underscore, or just before one of the Capital letters.
8703
8704     local $to_trace = 0 if main::DEBUG;
8705
8706     my $line = shift;
8707     my $prefix = shift;     # Optional string to prepend to each output
8708                             # line
8709     $prefix = "" unless defined $prefix;
8710
8711     my $hanging_indent = shift; # Optional number of spaces to indent
8712                                 # continuation lines
8713     $hanging_indent = 0 unless $hanging_indent;
8714
8715     my $right_margin = shift;   # Optional number of spaces to narrow the
8716                                 # total width by.
8717     $right_margin = 0 unless defined $right_margin;
8718
8719     # Call carp with the 'nofold' option to avoid it from trying to call us
8720     # recursively
8721     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8722
8723     # The space available doesn't include what's automatically prepended
8724     # to each line, or what's reserved on the right.
8725     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8726     # XXX Instead of using the 'nofold' perhaps better to look up the stack
8727
8728     if (DEBUG && $hanging_indent >= $max) {
8729         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
8730         $hanging_indent = 0;
8731     }
8732
8733     # First, split into the current physical lines.
8734     my @line;
8735     if (ref $line) {        # Better be an array, because not bothering to
8736                             # test
8737         foreach my $line (@{$line}) {
8738             push @line, split /\n/, $line;
8739         }
8740     }
8741     else {
8742         @line = split /\n/, $line;
8743     }
8744
8745     #local $to_trace = 1 if main::DEBUG;
8746     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8747
8748     # Look at each current physical line.
8749     for (my $i = 0; $i < @line; $i++) {
8750         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8751         #local $to_trace = 1 if main::DEBUG;
8752         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8753
8754         # Remove prefix, because will be added back anyway, don't want
8755         # doubled prefix
8756         $line[$i] =~ s/^$prefix//;
8757
8758         # Remove trailing space
8759         $line[$i] =~ s/\s+\Z//;
8760
8761         # If the line is too long, fold it.
8762         if (length $line[$i] > $max) {
8763             my $remainder;
8764
8765             # Here needs to fold.  Save the leading space in the line for
8766             # later.
8767             $line[$i] =~ /^ ( \s* )/x;
8768             my $leading_space = $1;
8769             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8770
8771             # If character at final permissible position is white space,
8772             # fold there, which will delete that white space
8773             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8774                 $remainder = substr($line[$i], $max);
8775                 $line[$i] = substr($line[$i], 0, $max - 1);
8776             }
8777             else {
8778
8779                 # Otherwise fold at an acceptable break char closest to
8780                 # the max length.  Look at just the maximal initial
8781                 # segment of the line
8782                 my $segment = substr($line[$i], 0, $max - 1);
8783                 if ($segment =~
8784                     /^ ( .{$hanging_indent}   # Don't look before the
8785                                               #  indent.
8786                         \ *                   # Don't look in leading
8787                                               #  blanks past the indent
8788                             [^ ] .*           # Find the right-most
8789                         (?:                   #  acceptable break:
8790                             [ \s = ]          # space or equal
8791                             | - (?! [.0-9] )  # or non-unary minus.
8792                         )                     # $1 includes the character
8793                     )/x)
8794                 {
8795                     # Split into the initial part that fits, and remaining
8796                     # part of the input
8797                     $remainder = substr($line[$i], length $1);
8798                     $line[$i] = $1;
8799                     trace $line[$i] if DEBUG && $to_trace;
8800                     trace $remainder if DEBUG && $to_trace;
8801                 }
8802
8803                 # If didn't find a good breaking spot, see if there is a
8804                 # not-so-good breaking spot.  These are just after
8805                 # underscores or where the case changes from lower to
8806                 # upper.  Use \a as a soft hyphen, but give up
8807                 # and don't break the line if there is actually a \a
8808                 # already in the input.  We use an ascii character for the
8809                 # soft-hyphen to avoid any attempt by miniperl to try to
8810                 # access the files that this program is creating.
8811                 elsif ($segment !~ /\a/
8812                        && ($segment =~ s/_/_\a/g
8813                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8814                 {
8815                     # Here were able to find at least one place to insert
8816                     # our substitute soft hyphen.  Find the right-most one
8817                     # and replace it by a real hyphen.
8818                     trace $segment if DEBUG && $to_trace;
8819                     substr($segment,
8820                             rindex($segment, "\a"),
8821                             1) = '-';
8822
8823                     # Then remove the soft hyphen substitutes.
8824                     $segment =~ s/\a//g;
8825                     trace $segment if DEBUG && $to_trace;
8826
8827                     # And split into the initial part that fits, and
8828                     # remainder of the line
8829                     my $pos = rindex($segment, '-');
8830                     $remainder = substr($line[$i], $pos);
8831                     trace $remainder if DEBUG && $to_trace;
8832                     $line[$i] = substr($segment, 0, $pos + 1);
8833                 }
8834             }
8835
8836             # Here we know if we can fold or not.  If we can, $remainder
8837             # is what remains to be processed in the next iteration.
8838             if (defined $remainder) {
8839                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8840
8841                 # Insert the folded remainder of the line as a new element
8842                 # of the array.  (It may still be too long, but we will
8843                 # deal with that next time through the loop.)  Omit any
8844                 # leading space in the remainder.
8845                 $remainder =~ s/^\s+//;
8846                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8847
8848                 # But then indent by whichever is larger of:
8849                 # 1) the leading space on the input line;
8850                 # 2) the hanging indent.
8851                 # This preserves indentation in the original line.
8852                 my $lead = ($leading_space)
8853                             ? length $leading_space
8854                             : $hanging_indent;
8855                 $lead = max($lead, $hanging_indent);
8856                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8857             }
8858         }
8859
8860         # Ready to output the line. Get rid of any trailing space
8861         # And prefix by the required $prefix passed in.
8862         $line[$i] =~ s/\s+$//;
8863         $line[$i] = "$prefix$line[$i]\n";
8864     } # End of looping through all the lines.
8865
8866     return join "", @line;
8867 }
8868
8869 sub property_ref {  # Returns a reference to a property object.
8870     return Property::property_ref(@_);
8871 }
8872
8873 sub force_unlink ($) {
8874     my $filename = shift;
8875     return unless file_exists($filename);
8876     return if CORE::unlink($filename);
8877
8878     # We might need write permission
8879     chmod 0777, $filename;
8880     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
8881     return;
8882 }
8883
8884 sub write ($$@) {
8885     # Given a filename and references to arrays of lines, write the lines of
8886     # each array to the file
8887     # Filename can be given as an arrayref of directory names
8888
8889     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8890
8891     my $file  = shift;
8892     my $use_utf8 = shift;
8893
8894     # Get into a single string if an array, and get rid of, in Unix terms, any
8895     # leading '.'
8896     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8897     $file = File::Spec->canonpath($file);
8898
8899     # If has directories, make sure that they all exist
8900     (undef, my $directories, undef) = File::Spec->splitpath($file);
8901     File::Path::mkpath($directories) if $directories && ! -d $directories;
8902
8903     push @files_actually_output, $file;
8904
8905     force_unlink ($file);
8906
8907     my $OUT;
8908     if (not open $OUT, ">", $file) {
8909         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
8910         return;
8911     }
8912
8913     binmode $OUT, ":utf8" if $use_utf8;
8914
8915     while (defined (my $lines_ref = shift)) {
8916         unless (@$lines_ref) {
8917             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8918         }
8919
8920         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8921     }
8922     close $OUT or die Carp::my_carp("close '$file' failed: $!");
8923
8924     print "$file written.\n" if $verbosity >= $VERBOSE;
8925
8926     return;
8927 }
8928
8929
8930 sub Standardize($) {
8931     # This converts the input name string into a standardized equivalent to
8932     # use internally.
8933
8934     my $name = shift;
8935     unless (defined $name) {
8936       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
8937       return;
8938     }
8939
8940     # Remove any leading or trailing white space
8941     $name =~ s/^\s+//g;
8942     $name =~ s/\s+$//g;
8943
8944     # Convert interior white space and hyphens into underscores.
8945     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8946
8947     # Capitalize the letter following an underscore, and convert a sequence of
8948     # multiple underscores to a single one
8949     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8950
8951     # And capitalize the first letter, but not for the special cjk ones.
8952     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8953     return $name;
8954 }
8955
8956 sub standardize ($) {
8957     # Returns a lower-cased standardized name, without underscores.  This form
8958     # is chosen so that it can distinguish between any real versus superficial
8959     # Unicode name differences.  It relies on the fact that Unicode doesn't
8960     # have interior underscores, white space, nor dashes in any
8961     # stricter-matched name.  It should not be used on Unicode code point
8962     # names (the Name property), as they mostly, but not always follow these
8963     # rules.
8964
8965     my $name = Standardize(shift);
8966     return if !defined $name;
8967
8968     $name =~ s/ (?<= .) _ (?= . ) //xg;
8969     return lc $name;
8970 }
8971
8972 sub utf8_heavy_name ($$) {
8973     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
8974     # perhaps this function should be placed somewhere, like Heavy.pl so that
8975     # utf8_heavy can use it directly without duplicating code that can get
8976     # out-of sync.
8977
8978     my $table = shift;
8979     my $alias = shift;
8980     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8981
8982     my $property = $table->property;
8983     $property = ($property == $perl)
8984                 ? ""                # 'perl' is never explicitly stated
8985                 : standardize($property->name) . '=';
8986     if ($alias->loose_match) {
8987         return $property . standardize($alias->name);
8988     }
8989     else {
8990         return lc ($property . $alias->name);
8991     }
8992
8993     return;
8994 }
8995
8996 {   # Closure
8997
8998     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8999     my %already_output;
9000
9001     $main::simple_dumper_nesting = 0;
9002
9003     sub simple_dumper {
9004         # Like Simple Data::Dumper. Good enough for our needs. We can't use
9005         # the real thing as we have to run under miniperl.
9006
9007         # It is designed so that on input it is at the beginning of a line,
9008         # and the final thing output in any call is a trailing ",\n".
9009
9010         my $item = shift;
9011         my $indent = shift;
9012         $indent = "" if ! $debugging_build || ! defined $indent;
9013
9014         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9015
9016         # nesting level is localized, so that as the call stack pops, it goes
9017         # back to the prior value.
9018         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9019         undef %already_output if $main::simple_dumper_nesting == 0;
9020         $main::simple_dumper_nesting++;
9021         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9022
9023         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9024
9025         # Determine the indent for recursive calls.
9026         my $next_indent = $indent . $indent_increment;
9027
9028         my $output;
9029         if (! ref $item) {
9030
9031             # Dump of scalar: just output it in quotes if not a number.  To do
9032             # so we must escape certain characters, and therefore need to
9033             # operate on a copy to avoid changing the original
9034             my $copy = $item;
9035             $copy = $UNDEF unless defined $copy;
9036
9037             # Quote non-integers (integers also have optional leading '-')
9038             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9039
9040                 # Escape apostrophe and backslash
9041                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
9042                 $copy = "'$copy'";
9043             }
9044             $output = "$indent$copy,\n";
9045         }
9046         else {
9047
9048             # Keep track of cycles in the input, and refuse to infinitely loop
9049             my $addr = do { no overloading; pack 'J', $item; };
9050             if (defined $already_output{$addr}) {
9051                 return "${indent}ALREADY OUTPUT: $item\n";
9052             }
9053             $already_output{$addr} = $item;
9054
9055             if (ref $item eq 'ARRAY') {
9056                 my $using_brackets;
9057                 $output = $indent;
9058                 if ($main::simple_dumper_nesting > 1) {
9059                     $output .= '[';
9060                     $using_brackets = 1;
9061                 }
9062                 else {
9063                     $using_brackets = 0;
9064                 }
9065
9066                 # If the array is empty, put the closing bracket on the same
9067                 # line.  Otherwise, recursively add each array element
9068                 if (@$item == 0) {
9069                     $output .= " ";
9070                 }
9071                 else {
9072                     $output .= "\n";
9073                     for (my $i = 0; $i < @$item; $i++) {
9074
9075                         # Indent array elements one level
9076                         $output .= &simple_dumper($item->[$i], $next_indent);
9077                         next if ! $debugging_build;
9078                         $output =~ s/\n$//;      # Remove any trailing nl so
9079                         $output .= " # [$i]\n";  # as to add a comment giving
9080                                                  # the array index
9081                     }
9082                     $output .= $indent;     # Indent closing ']' to orig level
9083                 }
9084                 $output .= ']' if $using_brackets;
9085                 $output .= ",\n";
9086             }
9087             elsif (ref $item eq 'HASH') {
9088                 my $is_first_line;
9089                 my $using_braces;
9090                 my $body_indent;
9091
9092                 # No surrounding braces at top level
9093                 $output .= $indent;
9094                 if ($main::simple_dumper_nesting > 1) {
9095                     $output .= "{\n";
9096                     $is_first_line = 0;
9097                     $body_indent = $next_indent;
9098                     $next_indent .= $indent_increment;
9099                     $using_braces = 1;
9100                 }
9101                 else {
9102                     $is_first_line = 1;
9103                     $body_indent = $indent;
9104                     $using_braces = 0;
9105                 }
9106
9107                 # Output hashes sorted alphabetically instead of apparently
9108                 # random.  Use caseless alphabetic sort
9109                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9110                 {
9111                     if ($is_first_line) {
9112                         $is_first_line = 0;
9113                     }
9114                     else {
9115                         $output .= "$body_indent";
9116                     }
9117
9118                     # The key must be a scalar, but this recursive call quotes
9119                     # it
9120                     $output .= &simple_dumper($key);
9121
9122                     # And change the trailing comma and nl to the hash fat
9123                     # comma for clarity, and so the value can be on the same
9124                     # line
9125                     $output =~ s/,\n$/ => /;
9126
9127                     # Recursively call to get the value's dump.
9128                     my $next = &simple_dumper($item->{$key}, $next_indent);
9129
9130                     # If the value is all on one line, remove its indent, so
9131                     # will follow the => immediately.  If it takes more than
9132                     # one line, start it on a new line.
9133                     if ($next !~ /\n.*\n/) {
9134                         $next =~ s/^ *//;
9135                     }
9136                     else {
9137                         $output .= "\n";
9138                     }
9139                     $output .= $next;
9140                 }
9141
9142                 $output .= "$indent},\n" if $using_braces;
9143             }
9144             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9145                 $output = $indent . ref($item) . "\n";
9146                 # XXX see if blessed
9147             }
9148             elsif ($item->can('dump')) {
9149
9150                 # By convention in this program, objects furnish a 'dump'
9151                 # method.  Since not doing any output at this level, just pass
9152                 # on the input indent
9153                 $output = $item->dump($indent);
9154             }
9155             else {
9156                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
9157             }
9158         }
9159         return $output;
9160     }
9161 }
9162
9163 sub dump_inside_out {
9164     # Dump inside-out hashes in an object's state by converting them to a
9165     # regular hash and then calling simple_dumper on that.
9166
9167     my $object = shift;
9168     my $fields_ref = shift;
9169     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9170
9171     my $addr = do { no overloading; pack 'J', $object; };
9172
9173     my %hash;
9174     foreach my $key (keys %$fields_ref) {
9175         $hash{$key} = $fields_ref->{$key}{$addr};
9176     }
9177
9178     return simple_dumper(\%hash, @_);
9179 }
9180
9181 sub _operator_dot {
9182     # Overloaded '.' method that is common to all packages.  It uses the
9183     # package's stringify method.
9184
9185     my $self = shift;
9186     my $other = shift;
9187     my $reversed = shift;
9188     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9189
9190     $other = "" unless defined $other;
9191
9192     foreach my $which (\$self, \$other) {
9193         next unless ref $$which;
9194         if ($$which->can('_operator_stringify')) {
9195             $$which = $$which->_operator_stringify;
9196         }
9197         else {
9198             my $ref = ref $$which;
9199             my $addr = do { no overloading; pack 'J', $$which; };
9200             $$which = "$ref ($addr)";
9201         }
9202     }
9203     return ($reversed)
9204             ? "$other$self"
9205             : "$self$other";
9206 }
9207
9208 sub _operator_dot_equal {
9209     # Overloaded '.=' method that is common to all packages.
9210
9211     my $self = shift;
9212     my $other = shift;
9213     my $reversed = shift;
9214     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9215
9216     $other = "" unless defined $other;
9217
9218     if ($reversed) {
9219         return $other .= "$self";
9220     }
9221     else {
9222         return "$self" . "$other";
9223     }
9224 }
9225
9226 sub _operator_equal {
9227     # Generic overloaded '==' routine.  To be equal, they must be the exact
9228     # same object
9229
9230     my $self = shift;
9231     my $other = shift;
9232
9233     return 0 unless defined $other;
9234     return 0 unless ref $other;
9235     no overloading;
9236     return $self == $other;
9237 }
9238
9239 sub _operator_not_equal {
9240     my $self = shift;
9241     my $other = shift;
9242
9243     return ! _operator_equal($self, $other);
9244 }
9245
9246 sub process_PropertyAliases($) {
9247     # This reads in the PropertyAliases.txt file, which contains almost all
9248     # the character properties in Unicode and their equivalent aliases:
9249     # scf       ; Simple_Case_Folding         ; sfc
9250     #
9251     # Field 0 is the preferred short name for the property.
9252     # Field 1 is the full name.
9253     # Any succeeding ones are other accepted names.
9254
9255     my $file= shift;
9256     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9257
9258     # This whole file was non-existent in early releases, so use our own
9259     # internal one.
9260     $file->insert_lines(get_old_property_aliases())
9261                                                 if ! -e 'PropertyAliases.txt';
9262
9263     # Add any cjk properties that may have been defined.
9264     $file->insert_lines(@cjk_properties);
9265
9266     while ($file->next_line) {
9267
9268         my @data = split /\s*;\s*/;
9269
9270         my $full = $data[1];
9271
9272         my $this = Property->new($data[0], Full_Name => $full);
9273
9274         # Start looking for more aliases after these two.
9275         for my $i (2 .. @data - 1) {
9276             $this->add_alias($data[$i]);
9277         }
9278
9279     }
9280
9281     my $scf = property_ref("Simple_Case_Folding");
9282     $scf->add_alias("scf");
9283     $scf->add_alias("sfc");
9284
9285     return;
9286 }
9287
9288 sub finish_property_setup {
9289     # Finishes setting up after PropertyAliases.
9290
9291     my $file = shift;
9292     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9293
9294     # This entry was missing from this file in earlier Unicode versions
9295     if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9296         Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9297     }
9298
9299     # These two properties must be defined in all releases so we can generate
9300     # the tables from them to make regex \X work, but suppress their output so
9301     # aren't application visible prior to releases where they should be
9302     if (! defined property_ref('GCB')) {
9303         Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9304                       Fate => $PLACEHOLDER);
9305     }
9306     if (! defined property_ref('hst')) {
9307         Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9308                       Fate => $PLACEHOLDER);
9309     }
9310
9311     # These are used so much, that we set globals for them.
9312     $gc = property_ref('General_Category');
9313     $block = property_ref('Block');
9314     $script = property_ref('Script');
9315
9316     # Perl adds this alias.
9317     $gc->add_alias('Category');
9318
9319     # Unicode::Normalize expects this file with this name and directory.
9320     my $ccc = property_ref('Canonical_Combining_Class');
9321     if (defined $ccc) {
9322         $ccc->set_file('CombiningClass');
9323         $ccc->set_directory(File::Spec->curdir());
9324     }
9325
9326     # These two properties aren't actually used in the core, but unfortunately
9327     # the names just above that are in the core interfere with these, so
9328     # choose different names.  These aren't a problem unless the map tables
9329     # for these files get written out.
9330     my $lowercase = property_ref('Lowercase');
9331     $lowercase->set_file('IsLower') if defined $lowercase;
9332     my $uppercase = property_ref('Uppercase');
9333     $uppercase->set_file('IsUpper') if defined $uppercase;
9334
9335     # Set up the hard-coded default mappings, but only on properties defined
9336     # for this release
9337     foreach my $property (keys %default_mapping) {
9338         my $property_object = property_ref($property);
9339         next if ! defined $property_object;
9340         my $default_map = $default_mapping{$property};
9341         $property_object->set_default_map($default_map);
9342
9343         # A map of <code point> implies the property is string.
9344         if ($property_object->type == $UNKNOWN
9345             && $default_map eq $CODE_POINT)
9346         {
9347             $property_object->set_type($STRING);
9348         }
9349     }
9350
9351     # The following use the Multi_Default class to create objects for
9352     # defaults.
9353
9354     # Bidi class has a complicated default, but the derived file takes care of
9355     # the complications, leaving just 'L'.
9356     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9357         property_ref('Bidi_Class')->set_default_map('L');
9358     }
9359     else {
9360         my $default;
9361
9362         # The derived file was introduced in 3.1.1.  The values below are
9363         # taken from table 3-8, TUS 3.0
9364         my $default_R =
9365             'my $default = Range_List->new;
9366              $default->add_range(0x0590, 0x05FF);
9367              $default->add_range(0xFB1D, 0xFB4F);'
9368         ;
9369
9370         # The defaults apply only to unassigned characters
9371         $default_R .= '$gc->table("Unassigned") & $default;';
9372
9373         if ($v_version lt v3.0.0) {
9374             $default = Multi_Default->new(R => $default_R, 'L');
9375         }
9376         else {
9377
9378             # AL apparently not introduced until 3.0:  TUS 2.x references are
9379             # not on-line to check it out
9380             my $default_AL =
9381                 'my $default = Range_List->new;
9382                  $default->add_range(0x0600, 0x07BF);
9383                  $default->add_range(0xFB50, 0xFDFF);
9384                  $default->add_range(0xFE70, 0xFEFF);'
9385             ;
9386
9387             # Non-character code points introduced in this release; aren't AL
9388             if ($v_version ge 3.1.0) {
9389                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9390             }
9391             $default_AL .= '$gc->table("Unassigned") & $default';
9392             $default = Multi_Default->new(AL => $default_AL,
9393                                           R => $default_R,
9394                                           'L');
9395         }
9396         property_ref('Bidi_Class')->set_default_map($default);
9397     }
9398
9399     # Joining type has a complicated default, but the derived file takes care
9400     # of the complications, leaving just 'U' (or Non_Joining), except the file
9401     # is bad in 3.1.0
9402     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9403         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9404             property_ref('Joining_Type')->set_default_map('Non_Joining');
9405         }
9406         else {
9407
9408             # Otherwise, there are not one, but two possibilities for the
9409             # missing defaults: T and U.
9410             # The missing defaults that evaluate to T are given by:
9411             # T = Mn + Cf - ZWNJ - ZWJ
9412             # where Mn and Cf are the general category values. In other words,
9413             # any non-spacing mark or any format control character, except
9414             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9415             # WIDTH JOINER (joining type C).
9416             my $default = Multi_Default->new(
9417                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9418                'Non_Joining');
9419             property_ref('Joining_Type')->set_default_map($default);
9420         }
9421     }
9422
9423     # Line break has a complicated default in early releases. It is 'Unknown'
9424     # for non-assigned code points; 'AL' for assigned.
9425     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9426         my $lb = property_ref('Line_Break');
9427         if ($v_version gt 3.2.0) {
9428             $lb->set_default_map('Unknown');
9429         }
9430         else {
9431             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9432                                               'AL');
9433             $lb->set_default_map($default);
9434         }
9435
9436         # If has the URS property, make sure that the standard aliases are in
9437         # it, since not in the input tables in some versions.
9438         my $urs = property_ref('Unicode_Radical_Stroke');
9439         if (defined $urs) {
9440             $urs->add_alias('cjkRSUnicode');
9441             $urs->add_alias('kRSUnicode');
9442         }
9443     }
9444
9445     # For backwards compatibility with applications that may read the mapping
9446     # file directly (it was documented in 5.12 and 5.14 as being thusly
9447     # usable), keep it from being adjusted.  (range_size_1 is
9448     # used to force the traditional format.)
9449     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9450         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9451         $nfkc_cf->set_range_size_1(1);
9452     }
9453     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9454         $bmg->set_to_output_map($EXTERNAL_MAP);
9455         $bmg->set_range_size_1(1);
9456     }
9457
9458     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9459
9460     return;
9461 }
9462
9463 sub get_old_property_aliases() {
9464     # Returns what would be in PropertyAliases.txt if it existed in very old
9465     # versions of Unicode.  It was derived from the one in 3.2, and pared
9466     # down based on the data that was actually in the older releases.
9467     # An attempt was made to use the existence of files to mean inclusion or
9468     # not of various aliases, but if this was not sufficient, using version
9469     # numbers was resorted to.
9470
9471     my @return;
9472
9473     # These are to be used in all versions (though some are constructed by
9474     # this program if missing)
9475     push @return, split /\n/, <<'END';
9476 bc        ; Bidi_Class
9477 Bidi_M    ; Bidi_Mirrored
9478 cf        ; Case_Folding
9479 ccc       ; Canonical_Combining_Class
9480 dm        ; Decomposition_Mapping
9481 dt        ; Decomposition_Type
9482 gc        ; General_Category
9483 isc       ; ISO_Comment
9484 lc        ; Lowercase_Mapping
9485 na        ; Name
9486 na1       ; Unicode_1_Name
9487 nt        ; Numeric_Type
9488 nv        ; Numeric_Value
9489 scf       ; Simple_Case_Folding
9490 slc       ; Simple_Lowercase_Mapping
9491 stc       ; Simple_Titlecase_Mapping
9492 suc       ; Simple_Uppercase_Mapping
9493 tc        ; Titlecase_Mapping
9494 uc        ; Uppercase_Mapping
9495 END
9496
9497     if (-e 'Blocks.txt') {
9498         push @return, "blk       ; Block\n";
9499     }
9500     if (-e 'ArabicShaping.txt') {
9501         push @return, split /\n/, <<'END';
9502 jg        ; Joining_Group
9503 jt        ; Joining_Type
9504 END
9505     }
9506     if (-e 'PropList.txt') {
9507
9508         # This first set is in the original old-style proplist.
9509         push @return, split /\n/, <<'END';
9510 Bidi_C    ; Bidi_Control
9511 Dash      ; Dash
9512 Dia       ; Diacritic
9513 Ext       ; Extender
9514 Hex       ; Hex_Digit
9515 Hyphen    ; Hyphen
9516 IDC       ; ID_Continue
9517 Ideo      ; Ideographic
9518 Join_C    ; Join_Control
9519 Math      ; Math
9520 QMark     ; Quotation_Mark
9521 Term      ; Terminal_Punctuation
9522 WSpace    ; White_Space
9523 END
9524         # The next sets were added later
9525         if ($v_version ge v3.0.0) {
9526             push @return, split /\n/, <<'END';
9527 Upper     ; Uppercase
9528 Lower     ; Lowercase
9529 END
9530         }
9531         if ($v_version ge v3.0.1) {
9532             push @return, split /\n/, <<'END';
9533 NChar     ; Noncharacter_Code_Point
9534 END
9535         }
9536         # The next sets were added in the new-style
9537         if ($v_version ge v3.1.0) {
9538             push @return, split /\n/, <<'END';
9539 OAlpha    ; Other_Alphabetic
9540 OLower    ; Other_Lowercase
9541 OMath     ; Other_Math
9542 OUpper    ; Other_Uppercase
9543 END
9544         }
9545         if ($v_version ge v3.1.1) {
9546             push @return, "AHex      ; ASCII_Hex_Digit\n";
9547         }
9548     }
9549     if (-e 'EastAsianWidth.txt') {
9550         push @return, "ea        ; East_Asian_Width\n";
9551     }
9552     if (-e 'CompositionExclusions.txt') {
9553         push @return, "CE        ; Composition_Exclusion\n";
9554     }
9555     if (-e 'LineBreak.txt') {
9556         push @return, "lb        ; Line_Break\n";
9557     }
9558     if (-e 'BidiMirroring.txt') {
9559         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
9560     }
9561     if (-e 'Scripts.txt') {
9562         push @return, "sc        ; Script\n";
9563     }
9564     if (-e 'DNormalizationProps.txt') {
9565         push @return, split /\n/, <<'END';
9566 Comp_Ex   ; Full_Composition_Exclusion
9567 FC_NFKC   ; FC_NFKC_Closure
9568 NFC_QC    ; NFC_Quick_Check
9569 NFD_QC    ; NFD_Quick_Check
9570 NFKC_QC   ; NFKC_Quick_Check
9571 NFKD_QC   ; NFKD_Quick_Check
9572 XO_NFC    ; Expands_On_NFC
9573 XO_NFD    ; Expands_On_NFD
9574 XO_NFKC   ; Expands_On_NFKC
9575 XO_NFKD   ; Expands_On_NFKD
9576 END
9577     }
9578     if (-e 'DCoreProperties.txt') {
9579         push @return, split /\n/, <<'END';
9580 Alpha     ; Alphabetic
9581 IDS       ; ID_Start
9582 XIDC      ; XID_Continue
9583 XIDS      ; XID_Start
9584 END
9585         # These can also appear in some versions of PropList.txt
9586         push @return, "Lower     ; Lowercase\n"
9587                                     unless grep { $_ =~ /^Lower\b/} @return;
9588         push @return, "Upper     ; Uppercase\n"
9589                                     unless grep { $_ =~ /^Upper\b/} @return;
9590     }
9591
9592     # This flag requires the DAge.txt file to be copied into the directory.
9593     if (DEBUG && $compare_versions) {
9594         push @return, 'age       ; Age';
9595     }
9596
9597     return @return;
9598 }
9599
9600 sub process_PropValueAliases {
9601     # This file contains values that properties look like:
9602     # bc ; AL        ; Arabic_Letter
9603     # blk; n/a       ; Greek_And_Coptic                 ; Greek
9604     #
9605     # Field 0 is the property.
9606     # Field 1 is the short name of a property value or 'n/a' if no
9607     #                short name exists;
9608     # Field 2 is the full property value name;
9609     # Any other fields are more synonyms for the property value.
9610     # Purely numeric property values are omitted from the file; as are some
9611     # others, fewer and fewer in later releases
9612
9613     # Entries for the ccc property have an extra field before the
9614     # abbreviation:
9615     # ccc;   0; NR   ; Not_Reordered
9616     # It is the numeric value that the names are synonyms for.
9617
9618     # There are comment entries for values missing from this file:
9619     # # @missing: 0000..10FFFF; ISO_Comment; <none>
9620     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9621
9622     my $file= shift;
9623     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9624
9625     # This whole file was non-existent in early releases, so use our own
9626     # internal one if necessary.
9627     if (! -e 'PropValueAliases.txt') {
9628         $file->insert_lines(get_old_property_value_aliases());
9629     }
9630
9631     if ($v_version lt 4.0.0) {
9632         $file->insert_lines(split /\n/, <<'END'
9633 hst; L                                ; Leading_Jamo
9634 hst; LV                               ; LV_Syllable
9635 hst; LVT                              ; LVT_Syllable
9636 hst; NA                               ; Not_Applicable
9637 hst; T                                ; Trailing_Jamo
9638 hst; V                                ; Vowel_Jamo
9639 END
9640         );
9641     }
9642     if ($v_version lt 4.1.0) {
9643         $file->insert_lines(split /\n/, <<'END'
9644 GCB; CN                               ; Control
9645 GCB; CR                               ; CR
9646 GCB; EX                               ; Extend
9647 GCB; L                                ; L
9648 GCB; LF                               ; LF
9649 GCB; LV                               ; LV
9650 GCB; LVT                              ; LVT
9651 GCB; T                                ; T
9652 GCB; V                                ; V
9653 GCB; XX                               ; Other
9654 END
9655         );
9656     }
9657
9658
9659     # Add any explicit cjk values
9660     $file->insert_lines(@cjk_property_values);
9661
9662     # This line is used only for testing the code that checks for name
9663     # conflicts.  There is a script Inherited, and when this line is executed
9664     # it causes there to be a name conflict with the 'Inherited' that this
9665     # program generates for this block property value
9666     #$file->insert_lines('blk; n/a; Herited');
9667
9668
9669     # Process each line of the file ...
9670     while ($file->next_line) {
9671
9672         # Fix typo in input file
9673         s/CCC133/CCC132/g if $v_version eq v6.1.0;
9674
9675         my ($property, @data) = split /\s*;\s*/;
9676
9677         # The ccc property has an extra field at the beginning, which is the
9678         # numeric value.  Move it to be after the other two, mnemonic, fields,
9679         # so that those will be used as the property value's names, and the
9680         # number will be an extra alias.  (Rightmost splice removes field 1-2,
9681         # returning them in a slice; left splice inserts that before anything,
9682         # thus shifting the former field 0 to after them.)
9683         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9684
9685         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
9686         # there is no short name, use the full one in element 1
9687         if ($data[0] eq "n/a") {
9688             $data[0] = $data[1];
9689         }
9690         elsif ($data[0] ne $data[1]
9691                && standardize($data[0]) eq standardize($data[1])
9692                && $data[1] !~ /[[:upper:]]/)
9693         {
9694             # Also, there is a bug in the file in which "n/a" is omitted, and
9695             # the two fields are identical except for case, and the full name
9696             # is all lower case.  Copy the "short" name unto the full one to
9697             # give it some upper case.
9698
9699             $data[1] = $data[0];
9700         }
9701
9702         # Earlier releases had the pseudo property 'qc' that should expand to
9703         # the ones that replace it below.
9704         if ($property eq 'qc') {
9705             if (lc $data[0] eq 'y') {
9706                 $file->insert_lines('NFC_QC; Y      ; Yes',
9707                                     'NFD_QC; Y      ; Yes',
9708                                     'NFKC_QC; Y     ; Yes',
9709                                     'NFKD_QC; Y     ; Yes',
9710                                     );
9711             }
9712             elsif (lc $data[0] eq 'n') {
9713                 $file->insert_lines('NFC_QC; N      ; No',
9714                                     'NFD_QC; N      ; No',
9715                                     'NFKC_QC; N     ; No',
9716                                     'NFKD_QC; N     ; No',
9717                                     );
9718             }
9719             elsif (lc $data[0] eq 'm') {
9720                 $file->insert_lines('NFC_QC; M      ; Maybe',
9721                                     'NFKC_QC; M     ; Maybe',
9722                                     );
9723             }
9724             else {
9725                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9726             }
9727             next;
9728         }
9729
9730         # The first field is the short name, 2nd is the full one.
9731         my $property_object = property_ref($property);
9732         my $table = $property_object->add_match_table($data[0],
9733                                                 Full_Name => $data[1]);
9734
9735         # Start looking for more aliases after these two.
9736         for my $i (2 .. @data - 1) {
9737             $table->add_alias($data[$i]);
9738         }
9739     } # End of looping through the file
9740
9741     # As noted in the comments early in the program, it generates tables for
9742     # the default values for all releases, even those for which the concept
9743     # didn't exist at the time.  Here we add those if missing.
9744     my $age = property_ref('age');
9745     if (defined $age && ! defined $age->table('Unassigned')) {
9746         $age->add_match_table('Unassigned');
9747     }
9748     $block->add_match_table('No_Block') if -e 'Blocks.txt'
9749                                     && ! defined $block->table('No_Block');
9750
9751
9752     # Now set the default mappings of the properties from the file.  This is
9753     # done after the loop because a number of properties have only @missings
9754     # entries in the file, and may not show up until the end.
9755     my @defaults = $file->get_missings;
9756     foreach my $default_ref (@defaults) {
9757         my $default = $default_ref->[0];
9758         my $property = property_ref($default_ref->[1]);
9759         $property->set_default_map($default);
9760     }
9761     return;
9762 }
9763
9764 sub get_old_property_value_aliases () {
9765     # Returns what would be in PropValueAliases.txt if it existed in very old
9766     # versions of Unicode.  It was derived from the one in 3.2, and pared
9767     # down.  An attempt was made to use the existence of files to mean
9768     # inclusion or not of various aliases, but if this was not sufficient,
9769     # using version numbers was resorted to.
9770
9771     my @return = split /\n/, <<'END';
9772 bc ; AN        ; Arabic_Number
9773 bc ; B         ; Paragraph_Separator
9774 bc ; CS        ; Common_Separator
9775 bc ; EN        ; European_Number
9776 bc ; ES        ; European_Separator
9777 bc ; ET        ; European_Terminator
9778 bc ; L         ; Left_To_Right
9779 bc ; ON        ; Other_Neutral
9780 bc ; R         ; Right_To_Left
9781 bc ; WS        ; White_Space
9782
9783 Bidi_M; N; No; F; False
9784 Bidi_M; Y; Yes; T; True
9785
9786 # The standard combining classes are very much different in v1, so only use
9787 # ones that look right (not checked thoroughly)
9788 ccc;   0; NR   ; Not_Reordered
9789 ccc;   1; OV   ; Overlay
9790 ccc;   7; NK   ; Nukta
9791 ccc;   8; KV   ; Kana_Voicing
9792 ccc;   9; VR   ; Virama
9793 ccc; 202; ATBL ; Attached_Below_Left
9794 ccc; 216; ATAR ; Attached_Above_Right
9795 ccc; 218; BL   ; Below_Left
9796 ccc; 220; B    ; Below
9797 ccc; 222; BR   ; Below_Right
9798 ccc; 224; L    ; Left
9799 ccc; 228; AL   ; Above_Left
9800 ccc; 230; A    ; Above
9801 ccc; 232; AR   ; Above_Right
9802 ccc; 234; DA   ; Double_Above
9803
9804 dt ; can       ; canonical
9805 dt ; enc       ; circle
9806 dt ; fin       ; final
9807 dt ; font      ; font
9808 dt ; fra       ; fraction
9809 dt ; init      ; initial
9810 dt ; iso       ; isolated
9811 dt ; med       ; medial
9812 dt ; n/a       ; none
9813 dt ; nb        ; noBreak
9814 dt ; sqr       ; square
9815 dt ; sub       ; sub
9816 dt ; sup       ; super
9817
9818 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
9819 gc ; Cc        ; Control
9820 gc ; Cn        ; Unassigned
9821 gc ; Co        ; Private_Use
9822 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
9823 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
9824 gc ; Ll        ; Lowercase_Letter
9825 gc ; Lm        ; Modifier_Letter
9826 gc ; Lo        ; Other_Letter
9827 gc ; Lu        ; Uppercase_Letter
9828 gc ; M         ; Mark                             # Mc | Me | Mn
9829 gc ; Mc        ; Spacing_Mark
9830 gc ; Mn        ; Nonspacing_Mark
9831 gc ; N         ; Number                           # Nd | Nl | No
9832 gc ; Nd        ; Decimal_Number
9833 gc ; No        ; Other_Number
9834 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
9835 gc ; Pd        ; Dash_Punctuation
9836 gc ; Pe        ; Close_Punctuation
9837 gc ; Po        ; Other_Punctuation
9838 gc ; Ps        ; Open_Punctuation
9839 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
9840 gc ; Sc        ; Currency_Symbol
9841 gc ; Sm        ; Math_Symbol
9842 gc ; So        ; Other_Symbol
9843 gc ; Z         ; Separator                        # Zl | Zp | Zs
9844 gc ; Zl        ; Line_Separator
9845 gc ; Zp        ; Paragraph_Separator
9846 gc ; Zs        ; Space_Separator
9847
9848 nt ; de        ; Decimal
9849 nt ; di        ; Digit
9850 nt ; n/a       ; None
9851 nt ; nu        ; Numeric
9852 END
9853
9854     if (-e 'ArabicShaping.txt') {
9855         push @return, split /\n/, <<'END';
9856 jg ; n/a       ; AIN
9857 jg ; n/a       ; ALEF
9858 jg ; n/a       ; DAL
9859 jg ; n/a       ; GAF
9860 jg ; n/a       ; LAM
9861 jg ; n/a       ; MEEM
9862 jg ; n/a       ; NO_JOINING_GROUP
9863 jg ; n/a       ; NOON
9864 jg ; n/a       ; QAF
9865 jg ; n/a       ; SAD
9866 jg ; n/a       ; SEEN
9867 jg ; n/a       ; TAH
9868 jg ; n/a       ; WAW
9869
9870 jt ; C         ; Join_Causing
9871 jt ; D         ; Dual_Joining
9872 jt ; L         ; Left_Joining
9873 jt ; R         ; Right_Joining
9874 jt ; U         ; Non_Joining
9875 jt ; T         ; Transparent
9876 END
9877         if ($v_version ge v3.0.0) {
9878             push @return, split /\n/, <<'END';
9879 jg ; n/a       ; ALAPH
9880 jg ; n/a       ; BEH
9881 jg ; n/a       ; BETH
9882 jg ; n/a       ; DALATH_RISH
9883 jg ; n/a       ; E
9884 jg ; n/a       ; FEH
9885 jg ; n/a       ; FINAL_SEMKATH
9886 jg ; n/a       ; GAMAL
9887 jg ; n/a       ; HAH
9888 jg ; n/a       ; HAMZA_ON_HEH_GOAL
9889 jg ; n/a       ; HE
9890 jg ; n/a       ; HEH
9891 jg ; n/a       ; HEH_GOAL
9892 jg ; n/a       ; HETH
9893 jg ; n/a       ; KAF
9894 jg ; n/a       ; KAPH
9895 jg ; n/a       ; KNOTTED_HEH
9896 jg ; n/a       ; LAMADH
9897 jg ; n/a       ; MIM
9898 jg ; n/a       ; NUN
9899 jg ; n/a       ; PE
9900 jg ; n/a       ; QAPH
9901 jg ; n/a       ; REH
9902 jg ; n/a       ; REVERSED_PE
9903 jg ; n/a       ; SADHE
9904 jg ; n/a       ; SEMKATH
9905 jg ; n/a       ; SHIN
9906 jg ; n/a       ; SWASH_KAF
9907 jg ; n/a       ; TAW
9908 jg ; n/a       ; TEH_MARBUTA
9909 jg ; n/a       ; TETH
9910 jg ; n/a       ; YEH
9911 jg ; n/a       ; YEH_BARREE
9912 jg ; n/a       ; YEH_WITH_TAIL
9913 jg ; n/a       ; YUDH
9914 jg ; n/a       ; YUDH_HE
9915 jg ; n/a       ; ZAIN
9916 END
9917         }
9918     }
9919
9920
9921     if (-e 'EastAsianWidth.txt') {
9922         push @return, split /\n/, <<'END';
9923 ea ; A         ; Ambiguous
9924 ea ; F         ; Fullwidth
9925 ea ; H         ; Halfwidth
9926 ea ; N         ; Neutral
9927 ea ; Na        ; Narrow
9928 ea ; W         ; Wide
9929 END
9930     }
9931
9932     if (-e 'LineBreak.txt') {
9933         push @return, split /\n/, <<'END';
9934 lb ; AI        ; Ambiguous
9935 lb ; AL        ; Alphabetic
9936 lb ; B2        ; Break_Both
9937 lb ; BA        ; Break_After
9938 lb ; BB        ; Break_Before
9939 lb ; BK        ; Mandatory_Break
9940 lb ; CB        ; Contingent_Break
9941 lb ; CL        ; Close_Punctuation
9942 lb ; CM        ; Combining_Mark
9943 lb ; CR        ; Carriage_Return
9944 lb ; EX        ; Exclamation
9945 lb ; GL        ; Glue
9946 lb ; HY        ; Hyphen
9947 lb ; ID        ; Ideographic
9948 lb ; IN        ; Inseperable
9949 lb ; IS        ; Infix_Numeric
9950 lb ; LF        ; Line_Feed
9951 lb ; NS        ; Nonstarter
9952 lb ; NU        ; Numeric
9953 lb ; OP        ; Open_Punctuation
9954 lb ; PO        ; Postfix_Numeric
9955 lb ; PR        ; Prefix_Numeric
9956 lb ; QU        ; Quotation
9957 lb ; SA        ; Complex_Context
9958 lb ; SG        ; Surrogate
9959 lb ; SP        ; Space
9960 lb ; SY        ; Break_Symbols
9961 lb ; XX        ; Unknown
9962 lb ; ZW        ; ZWSpace
9963 END
9964     }
9965
9966     if (-e 'DNormalizationProps.txt') {
9967         push @return, split /\n/, <<'END';
9968 qc ; M         ; Maybe
9969 qc ; N         ; No
9970 qc ; Y         ; Yes
9971 END
9972     }
9973
9974     if (-e 'Scripts.txt') {
9975         push @return, split /\n/, <<'END';
9976 sc ; Arab      ; Arabic
9977 sc ; Armn      ; Armenian
9978 sc ; Beng      ; Bengali
9979 sc ; Bopo      ; Bopomofo
9980 sc ; Cans      ; Canadian_Aboriginal
9981 sc ; Cher      ; Cherokee
9982 sc ; Cyrl      ; Cyrillic
9983 sc ; Deva      ; Devanagari
9984 sc ; Dsrt      ; Deseret
9985 sc ; Ethi      ; Ethiopic
9986 sc ; Geor      ; Georgian
9987 sc ; Goth      ; Gothic
9988 sc ; Grek      ; Greek
9989 sc ; Gujr      ; Gujarati
9990 sc ; Guru      ; Gurmukhi
9991 sc ; Hang      ; Hangul
9992 sc ; Hani      ; Han
9993 sc ; Hebr      ; Hebrew
9994 sc ; Hira      ; Hiragana
9995 sc ; Ital      ; Old_Italic
9996 sc ; Kana      ; Katakana
9997 sc ; Khmr      ; Khmer
9998 sc ; Knda      ; Kannada
9999 sc ; Laoo      ; Lao
10000 sc ; Latn      ; Latin
10001 sc ; Mlym      ; Malayalam
10002 sc ; Mong      ; Mongolian
10003 sc ; Mymr      ; Myanmar
10004 sc ; Ogam      ; Ogham
10005 sc ; Orya      ; Oriya
10006 sc ; Qaai      ; Inherited
10007 sc ; Runr      ; Runic
10008 sc ; Sinh      ; Sinhala
10009 sc ; Syrc      ; Syriac
10010 sc ; Taml      ; Tamil
10011 sc ; Telu      ; Telugu
10012 sc ; Thaa      ; Thaana
10013 sc ; Thai      ; Thai
10014 sc ; Tibt      ; Tibetan
10015 sc ; Yiii      ; Yi
10016 sc ; Zyyy      ; Common
10017 END
10018     }
10019
10020     if ($v_version ge v2.0.0) {
10021         push @return, split /\n/, <<'END';
10022 dt ; com       ; compat
10023 dt ; nar       ; narrow
10024 dt ; sml       ; small
10025 dt ; vert      ; vertical
10026 dt ; wide      ; wide
10027
10028 gc ; Cf        ; Format
10029 gc ; Cs        ; Surrogate
10030 gc ; Lt        ; Titlecase_Letter
10031 gc ; Me        ; Enclosing_Mark
10032 gc ; Nl        ; Letter_Number
10033 gc ; Pc        ; Connector_Punctuation
10034 gc ; Sk        ; Modifier_Symbol
10035 END
10036     }
10037     if ($v_version ge v2.1.2) {
10038         push @return, "bc ; S         ; Segment_Separator\n";
10039     }
10040     if ($v_version ge v2.1.5) {
10041         push @return, split /\n/, <<'END';
10042 gc ; Pf        ; Final_Punctuation
10043 gc ; Pi        ; Initial_Punctuation
10044 END
10045     }
10046     if ($v_version ge v2.1.8) {
10047         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
10048     }
10049
10050     if ($v_version ge v3.0.0) {
10051         push @return, split /\n/, <<'END';
10052 bc ; AL        ; Arabic_Letter
10053 bc ; BN        ; Boundary_Neutral
10054 bc ; LRE       ; Left_To_Right_Embedding
10055 bc ; LRO       ; Left_To_Right_Override
10056 bc ; NSM       ; Nonspacing_Mark
10057 bc ; PDF       ; Pop_Directional_Format
10058 bc ; RLE       ; Right_To_Left_Embedding
10059 bc ; RLO       ; Right_To_Left_Override
10060
10061 ccc; 233; DB   ; Double_Below
10062 END
10063     }
10064
10065     if ($v_version ge v3.1.0) {
10066         push @return, "ccc; 226; R    ; Right\n";
10067     }
10068
10069     return @return;
10070 }
10071
10072 sub process_NormalizationsTest {
10073
10074     # Each line looks like:
10075     #      source code point; NFC; NFD; NFKC; NFKD
10076     # e.g.
10077     #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
10078
10079     my $file= shift;
10080     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10081
10082     # Process each line of the file ...
10083     while ($file->next_line) {
10084
10085         next if /^@/;
10086
10087         my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10088
10089         foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10090             $$var = pack "U0U*", map { hex } split " ", $$var;
10091             $$var =~ s/(\\)/$1$1/g;
10092         }
10093
10094         push @normalization_tests,
10095                 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
10096     } # End of looping through the file
10097 }
10098
10099 sub output_perl_charnames_line ($$) {
10100
10101     # Output the entries in Perl_charnames specially, using 5 digits instead
10102     # of four.  This makes the entries a constant length, and simplifies
10103     # charnames.pm which this table is for.  Unicode can have 6 digit
10104     # ordinals, but they are all private use or noncharacters which do not
10105     # have names, so won't be in this table.
10106
10107     return sprintf "%05X\t%s\n", $_[0], $_[1];
10108 }
10109
10110 { # Closure
10111     # This is used to store the range list of all the code points usable when
10112     # the little used $compare_versions feature is enabled.
10113     my $compare_versions_range_list;
10114
10115     # These are constants to the $property_info hash in this subroutine, to
10116     # avoid using a quoted-string which might have a typo.
10117     my $TYPE  = 'type';
10118     my $DEFAULT_MAP = 'default_map';
10119     my $DEFAULT_TABLE = 'default_table';
10120     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10121     my $MISSINGS = 'missings';
10122
10123     sub process_generic_property_file {
10124         # This processes a file containing property mappings and puts them
10125         # into internal map tables.  It should be used to handle any property
10126         # files that have mappings from a code point or range thereof to
10127         # something else.  This means almost all the UCD .txt files.
10128         # each_line_handlers() should be set to adjust the lines of these
10129         # files, if necessary, to what this routine understands:
10130         #
10131         # 0374          ; NFD_QC; N
10132         # 003C..003E    ; Math
10133         #
10134         # the fields are: "codepoint-range ; property; map"
10135         #
10136         # meaning the codepoints in the range all have the value 'map' under
10137         # 'property'.
10138         # Beginning and trailing white space in each field are not significant.
10139         # Note there is not a trailing semi-colon in the above.  A trailing
10140         # semi-colon means the map is a null-string.  An omitted map, as
10141         # opposed to a null-string, is assumed to be 'Y', based on Unicode
10142         # table syntax.  (This could have been hidden from this routine by
10143         # doing it in the $file object, but that would require parsing of the
10144         # line there, so would have to parse it twice, or change the interface
10145         # to pass this an array.  So not done.)
10146         #
10147         # The map field may begin with a sequence of commands that apply to
10148         # this range.  Each such command begins and ends with $CMD_DELIM.
10149         # These are used to indicate, for example, that the mapping for a
10150         # range has a non-default type.
10151         #
10152         # This loops through the file, calling its next_line() method, and
10153         # then taking the map and adding it to the property's table.
10154         # Complications arise because any number of properties can be in the
10155         # file, in any order, interspersed in any way.  The first time a
10156         # property is seen, it gets information about that property and
10157         # caches it for quick retrieval later.  It also normalizes the maps
10158         # so that only one of many synonyms is stored.  The Unicode input
10159         # files do use some multiple synonyms.
10160
10161         my $file = shift;
10162         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10163
10164         my %property_info;               # To keep track of what properties
10165                                          # have already had entries in the
10166                                          # current file, and info about each,
10167                                          # so don't have to recompute.
10168         my $property_name;               # property currently being worked on
10169         my $property_type;               # and its type
10170         my $previous_property_name = ""; # name from last time through loop
10171         my $property_object;             # pointer to the current property's
10172                                          # object
10173         my $property_addr;               # the address of that object
10174         my $default_map;                 # the string that code points missing
10175                                          # from the file map to
10176         my $default_table;               # For non-string properties, a
10177                                          # reference to the match table that
10178                                          # will contain the list of code
10179                                          # points that map to $default_map.
10180
10181         # Get the next real non-comment line
10182         LINE:
10183         while ($file->next_line) {
10184
10185             # Default replacement type; means that if parts of the range have
10186             # already been stored in our tables, the new map overrides them if
10187             # they differ more than cosmetically
10188             my $replace = $IF_NOT_EQUIVALENT;
10189             my $map_type;            # Default type for the map of this range
10190
10191             #local $to_trace = 1 if main::DEBUG;
10192             trace $_ if main::DEBUG && $to_trace;
10193
10194             # Split the line into components
10195             my ($range, $property_name, $map, @remainder)
10196                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10197
10198             # If more or less on the line than we are expecting, warn and skip
10199             # the line
10200             if (@remainder) {
10201                 $file->carp_bad_line('Extra fields');
10202                 next LINE;
10203             }
10204             elsif ( ! defined $property_name) {
10205                 $file->carp_bad_line('Missing property');
10206                 next LINE;
10207             }
10208
10209             # Examine the range.
10210             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10211             {
10212                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10213                 next LINE;
10214             }
10215             my $low = hex $1;
10216             my $high = (defined $2) ? hex $2 : $low;
10217
10218             # For the very specialized case of comparing two Unicode
10219             # versions...
10220             if (DEBUG && $compare_versions) {
10221                 if ($property_name eq 'Age') {
10222
10223                     # Only allow code points at least as old as the version
10224                     # specified.
10225                     my $age = pack "C*", split(/\./, $map);        # v string
10226                     next LINE if $age gt $compare_versions;
10227                 }
10228                 else {
10229
10230                     # Again, we throw out code points younger than those of
10231                     # the specified version.  By now, the Age property is
10232                     # populated.  We use the intersection of each input range
10233                     # with this property to find what code points in it are
10234                     # valid.   To do the intersection, we have to convert the
10235                     # Age property map to a Range_list.  We only have to do
10236                     # this once.
10237                     if (! defined $compare_versions_range_list) {
10238                         my $age = property_ref('Age');
10239                         if (! -e 'DAge.txt') {
10240                             croak "Need to have 'DAge.txt' file to do version comparison";
10241                         }
10242                         elsif ($age->count == 0) {
10243                             croak "The 'Age' table is empty, but its file exists";
10244                         }
10245                         $compare_versions_range_list
10246                                         = Range_List->new(Initialize => $age);
10247                     }
10248
10249                     # An undefined map is always 'Y'
10250                     $map = 'Y' if ! defined $map;
10251
10252                     # Calculate the intersection of the input range with the
10253                     # code points that are known in the specified version
10254                     my @ranges = ($compare_versions_range_list
10255                                   & Range->new($low, $high))->ranges;
10256
10257                     # If the intersection is empty, throw away this range
10258                     next LINE unless @ranges;
10259
10260                     # Only examine the first range this time through the loop.
10261                     my $this_range = shift @ranges;
10262
10263                     # Put any remaining ranges in the queue to be processed
10264                     # later.  Note that there is unnecessary work here, as we
10265                     # will do the intersection again for each of these ranges
10266                     # during some future iteration of the LINE loop, but this
10267                     # code is not used in production.  The later intersections
10268                     # are guaranteed to not splinter, so this will not become
10269                     # an infinite loop.
10270                     my $line = join ';', $property_name, $map;
10271                     foreach my $range (@ranges) {
10272                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
10273                                                             $range->start,
10274                                                             $range->end,
10275                                                             $line));
10276                     }
10277
10278                     # And process the first range, like any other.
10279                     $low = $this_range->start;
10280                     $high = $this_range->end;
10281                 }
10282             } # End of $compare_versions
10283
10284             # If changing to a new property, get the things constant per
10285             # property
10286             if ($previous_property_name ne $property_name) {
10287
10288                 $property_object = property_ref($property_name);
10289                 if (! defined $property_object) {
10290                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10291                     next LINE;
10292                 }
10293                 { no overloading; $property_addr = pack 'J', $property_object; }
10294
10295                 # Defer changing names until have a line that is acceptable
10296                 # (the 'next' statement above means is unacceptable)
10297                 $previous_property_name = $property_name;
10298
10299                 # If not the first time for this property, retrieve info about
10300                 # it from the cache
10301                 if (defined ($property_info{$property_addr}{$TYPE})) {
10302                     $property_type = $property_info{$property_addr}{$TYPE};
10303                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10304                     $map_type
10305                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10306                     $default_table
10307                             = $property_info{$property_addr}{$DEFAULT_TABLE};
10308                 }
10309                 else {
10310
10311                     # Here, is the first time for this property.  Set up the
10312                     # cache.
10313                     $property_type = $property_info{$property_addr}{$TYPE}
10314                                    = $property_object->type;
10315                     $map_type
10316                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10317                         = $property_object->pseudo_map_type;
10318
10319                     # The Unicode files are set up so that if the map is not
10320                     # defined, it is a binary property
10321                     if (! defined $map && $property_type != $BINARY) {
10322                         if ($property_type != $UNKNOWN
10323                             && $property_type != $NON_STRING)
10324                         {
10325                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10326                         }
10327                         else {
10328                             $property_object->set_type($BINARY);
10329                             $property_type
10330                                 = $property_info{$property_addr}{$TYPE}
10331                                 = $BINARY;
10332                         }
10333                     }
10334
10335                     # Get any @missings default for this property.  This
10336                     # should precede the first entry for the property in the
10337                     # input file, and is located in a comment that has been
10338                     # stored by the Input_file class until we access it here.
10339                     # It's possible that there is more than one such line
10340                     # waiting for us; collect them all, and parse
10341                     my @missings_list = $file->get_missings
10342                                             if $file->has_missings_defaults;
10343                     foreach my $default_ref (@missings_list) {
10344                         my $default = $default_ref->[0];
10345                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10346
10347                         # For string properties, the default is just what the
10348                         # file says, but non-string properties should already
10349                         # have set up a table for the default property value;
10350                         # use the table for these, so can resolve synonyms
10351                         # later to a single standard one.
10352                         if ($property_type == $STRING
10353                             || $property_type == $UNKNOWN)
10354                         {
10355                             $property_info{$addr}{$MISSINGS} = $default;
10356                         }
10357                         else {
10358                             $property_info{$addr}{$MISSINGS}
10359                                         = $property_object->table($default);
10360                         }
10361                     }
10362
10363                     # Finished storing all the @missings defaults in the input
10364                     # file so far.  Get the one for the current property.
10365                     my $missings = $property_info{$property_addr}{$MISSINGS};
10366
10367                     # But we likely have separately stored what the default
10368                     # should be.  (This is to accommodate versions of the
10369                     # standard where the @missings lines are absent or
10370                     # incomplete.)  Hopefully the two will match.  But check
10371                     # it out.
10372                     $default_map = $property_object->default_map;
10373
10374                     # If the map is a ref, it means that the default won't be
10375                     # processed until later, so undef it, so next few lines
10376                     # will redefine it to something that nothing will match
10377                     undef $default_map if ref $default_map;
10378
10379                     # Create a $default_map if don't have one; maybe a dummy
10380                     # that won't match anything.
10381                     if (! defined $default_map) {
10382
10383                         # Use any @missings line in the file.
10384                         if (defined $missings) {
10385                             if (ref $missings) {
10386                                 $default_map = $missings->full_name;
10387                                 $default_table = $missings;
10388                             }
10389                             else {
10390                                 $default_map = $missings;
10391                             }
10392
10393                             # And store it with the property for outside use.
10394                             $property_object->set_default_map($default_map);
10395                         }
10396                         else {
10397
10398                             # Neither an @missings nor a default map.  Create
10399                             # a dummy one, so won't have to test definedness
10400                             # in the main loop.
10401                             $default_map = '_Perl This will never be in a file
10402                                             from Unicode';
10403                         }
10404                     }
10405
10406                     # Here, we have $default_map defined, possibly in terms of
10407                     # $missings, but maybe not, and possibly is a dummy one.
10408                     if (defined $missings) {
10409
10410                         # Make sure there is no conflict between the two.
10411                         # $missings has priority.
10412                         if (ref $missings) {
10413                             $default_table
10414                                         = $property_object->table($default_map);
10415                             if (! defined $default_table
10416                                 || $default_table != $missings)
10417                             {
10418                                 if (! defined $default_table) {
10419                                     $default_table = $UNDEF;
10420                                 }
10421                                 $file->carp_bad_line(<<END
10422 The \@missings line for $property_name in $file says that missings default to
10423 $missings, but we expect it to be $default_table.  $missings used.
10424 END
10425                                 );
10426                                 $default_table = $missings;
10427                                 $default_map = $missings->full_name;
10428                             }
10429                             $property_info{$property_addr}{$DEFAULT_TABLE}
10430                                                         = $default_table;
10431                         }
10432                         elsif ($default_map ne $missings) {
10433                             $file->carp_bad_line(<<END
10434 The \@missings line for $property_name in $file says that missings default to
10435 $missings, but we expect it to be $default_map.  $missings used.
10436 END
10437                             );
10438                             $default_map = $missings;
10439                         }
10440                     }
10441
10442                     $property_info{$property_addr}{$DEFAULT_MAP}
10443                                                     = $default_map;
10444
10445                     # If haven't done so already, find the table corresponding
10446                     # to this map for non-string properties.
10447                     if (! defined $default_table
10448                         && $property_type != $STRING
10449                         && $property_type != $UNKNOWN)
10450                     {
10451                         $default_table = $property_info{$property_addr}
10452                                                         {$DEFAULT_TABLE}
10453                                     = $property_object->table($default_map);
10454                     }
10455                 } # End of is first time for this property
10456             } # End of switching properties.
10457
10458             # Ready to process the line.
10459             # The Unicode files are set up so that if the map is not defined,
10460             # it is a binary property with value 'Y'
10461             if (! defined $map) {
10462                 $map = 'Y';
10463             }
10464             else {
10465
10466                 # If the map begins with a special command to us (enclosed in
10467                 # delimiters), extract the command(s).
10468                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10469                     my $command = $1;
10470                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
10471                         $replace = $1;
10472                     }
10473                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
10474                         $map_type = $1;
10475                     }
10476                     else {
10477                         $file->carp_bad_line("Unknown command line: '$1'");
10478                         next LINE;
10479                     }
10480                 }
10481             }
10482
10483             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10484             {
10485
10486                 # Here, we have a map to a particular code point, and the
10487                 # default map is to a code point itself.  If the range
10488                 # includes the particular code point, change that portion of
10489                 # the range to the default.  This makes sure that in the final
10490                 # table only the non-defaults are listed.
10491                 my $decimal_map = hex $map;
10492                 if ($low <= $decimal_map && $decimal_map <= $high) {
10493
10494                     # If the range includes stuff before or after the map
10495                     # we're changing, split it and process the split-off parts
10496                     # later.
10497                     if ($low < $decimal_map) {
10498                         $file->insert_adjusted_lines(
10499                                             sprintf("%04X..%04X; %s; %s",
10500                                                     $low,
10501                                                     $decimal_map - 1,
10502                                                     $property_name,
10503                                                     $map));
10504                     }
10505                     if ($high > $decimal_map) {
10506                         $file->insert_adjusted_lines(
10507                                             sprintf("%04X..%04X; %s; %s",
10508                                                     $decimal_map + 1,
10509                                                     $high,
10510                                                     $property_name,
10511                                                     $map));
10512                     }
10513                     $low = $high = $decimal_map;
10514                     $map = $CODE_POINT;
10515                 }
10516             }
10517
10518             # If we can tell that this is a synonym for the default map, use
10519             # the default one instead.
10520             if ($property_type != $STRING
10521                 && $property_type != $UNKNOWN)
10522             {
10523                 my $table = $property_object->table($map);
10524                 if (defined $table && $table == $default_table) {
10525                     $map = $default_map;
10526                 }
10527             }
10528
10529             # And figure out the map type if not known.
10530             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10531                 if ($map eq "") {   # Nulls are always $NULL map type
10532                     $map_type = $NULL;
10533                 } # Otherwise, non-strings, and those that don't allow
10534                   # $MULTI_CP, and those that aren't multiple code points are
10535                   # 0
10536                 elsif
10537                    (($property_type != $STRING && $property_type != $UNKNOWN)
10538                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10539                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
10540                 {
10541                     $map_type = 0;
10542                 }
10543                 else {
10544                     $map_type = $MULTI_CP;
10545                 }
10546             }
10547
10548             $property_object->add_map($low, $high,
10549                                         $map,
10550                                         Type => $map_type,
10551                                         Replace => $replace);
10552         } # End of loop through file's lines
10553
10554         return;
10555     }
10556 }
10557
10558 { # Closure for UnicodeData.txt handling
10559
10560     # This file was the first one in the UCD; its design leads to some
10561     # awkwardness in processing.  Here is a sample line:
10562     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10563     # The fields in order are:
10564     my $i = 0;            # The code point is in field 0, and is shifted off.
10565     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
10566     my $CATEGORY = $i++;  # category (e.g. "Lu")
10567     my $CCC = $i++;       # Canonical combining class (e.g. "230")
10568     my $BIDI = $i++;      # directional class (e.g. "L")
10569     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
10570     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
10571     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10572                                          # Dual-use in this program; see below
10573     my $NUMERIC = $i++;   # numeric value
10574     my $MIRRORED = $i++;  # ? mirrored
10575     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10576     my $COMMENT = $i++;   # iso comment
10577     my $UPPER = $i++;     # simple uppercase mapping
10578     my $LOWER = $i++;     # simple lowercase mapping
10579     my $TITLE = $i++;     # simple titlecase mapping
10580     my $input_field_count = $i;
10581
10582     # This routine in addition outputs these extra fields:
10583
10584     my $DECOMP_TYPE = $i++; # Decomposition type
10585
10586     # These fields are modifications of ones above, and are usually
10587     # suppressed; they must come last, as for speed, the loop upper bound is
10588     # normally set to ignore them
10589     my $NAME = $i++;        # This is the strict name field, not the one that
10590                             # charnames uses.
10591     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
10592                             # by Unicode::Normalize
10593     my $last_field = $i - 1;
10594
10595     # All these are read into an array for each line, with the indices defined
10596     # above.  The empty fields in the example line above indicate that the
10597     # value is defaulted.  The handler called for each line of the input
10598     # changes these to their defaults.
10599
10600     # Here are the official names of the properties, in a parallel array:
10601     my @field_names;
10602     $field_names[$BIDI] = 'Bidi_Class';
10603     $field_names[$CATEGORY] = 'General_Category';
10604     $field_names[$CCC] = 'Canonical_Combining_Class';
10605     $field_names[$CHARNAME] = 'Perl_Charnames';
10606     $field_names[$COMMENT] = 'ISO_Comment';
10607     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10608     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10609     $field_names[$LOWER] = 'Lowercase_Mapping';
10610     $field_names[$MIRRORED] = 'Bidi_Mirrored';
10611     $field_names[$NAME] = 'Name';
10612     $field_names[$NUMERIC] = 'Numeric_Value';
10613     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10614     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10615     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10616     $field_names[$TITLE] = 'Titlecase_Mapping';
10617     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10618     $field_names[$UPPER] = 'Uppercase_Mapping';
10619
10620     # Some of these need a little more explanation:
10621     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10622     #   property, but is used in calculating the Numeric_Type.  Perl however,
10623     #   creates a file from this field, so a Perl property is created from it.
10624     # Similarly, the Other_Digit field is used only for calculating the
10625     #   Numeric_Type, and so it can be safely re-used as the place to store
10626     #   the value for Numeric_Type; hence it is referred to as
10627     #   $NUMERIC_TYPE_OTHER_DIGIT.
10628     # The input field named $PERL_DECOMPOSITION is a combination of both the
10629     #   decomposition mapping and its type.  Perl creates a file containing
10630     #   exactly this field, so it is used for that.  The two properties are
10631     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10632     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
10633     #   output it), as Perl doesn't use it directly.
10634     # The input field named here $CHARNAME is used to construct the
10635     #   Perl_Charnames property, which is a combination of the Name property
10636     #   (which the input field contains), and the Unicode_1_Name property, and
10637     #   others from other files.  Since, the strict Name property is not used
10638     #   by Perl, this field is used for the table that Perl does use.  The
10639     #   strict Name property table is usually suppressed (unless the lists are
10640     #   changed to output it), so it is accumulated in a separate field,
10641     #   $NAME, which to save time is discarded unless the table is actually to
10642     #   be output
10643
10644     # This file is processed like most in this program.  Control is passed to
10645     # process_generic_property_file() which calls filter_UnicodeData_line()
10646     # for each input line.  This filter converts the input into line(s) that
10647     # process_generic_property_file() understands.  There is also a setup
10648     # routine called before any of the file is processed, and a handler for
10649     # EOF processing, all in this closure.
10650
10651     # A huge speed-up occurred at the cost of some added complexity when these
10652     # routines were altered to buffer the outputs into ranges.  Almost all the
10653     # lines of the input file apply to just one code point, and for most
10654     # properties, the map for the next code point up is the same as the
10655     # current one.  So instead of creating a line for each property for each
10656     # input line, filter_UnicodeData_line() remembers what the previous map
10657     # of a property was, and doesn't generate a line to pass on until it has
10658     # to, as when the map changes; and that passed-on line encompasses the
10659     # whole contiguous range of code points that have the same map for that
10660     # property.  This means a slight amount of extra setup, and having to
10661     # flush these buffers on EOF, testing if the maps have changed, plus
10662     # remembering state information in the closure.  But it means a lot less
10663     # real time in not having to change the data base for each property on
10664     # each line.
10665
10666     # Another complication is that there are already a few ranges designated
10667     # in the input.  There are two lines for each, with the same maps except
10668     # the code point and name on each line.  This was actually the hardest
10669     # thing to design around.  The code points in those ranges may actually
10670     # have real maps not given by these two lines.  These maps will either
10671     # be algorithmically determinable, or be in the extracted files furnished
10672     # with the UCD.  In the event of conflicts between these extracted files,
10673     # and this one, Unicode says that this one prevails.  But it shouldn't
10674     # prevail for conflicts that occur in these ranges.  The data from the
10675     # extracted files prevails in those cases.  So, this program is structured
10676     # so that those files are processed first, storing maps.  Then the other
10677     # files are processed, generally overwriting what the extracted files
10678     # stored.  But just the range lines in this input file are processed
10679     # without overwriting.  This is accomplished by adding a special string to
10680     # the lines output to tell process_generic_property_file() to turn off the
10681     # overwriting for just this one line.
10682     # A similar mechanism is used to tell it that the map is of a non-default
10683     # type.
10684
10685     sub setup_UnicodeData { # Called before any lines of the input are read
10686         my $file = shift;
10687         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10688
10689         # Create a new property specially located that is a combination of the
10690         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10691         # Name_Alias properties.  (The final duplicates elements of the
10692         # first.)  A comment for it will later be constructed based on the
10693         # actual properties present and used
10694         $perl_charname = Property->new('Perl_Charnames',
10695                        Default_Map => "",
10696                        Directory => File::Spec->curdir(),
10697                        File => 'Name',
10698                        Fate => $INTERNAL_ONLY,
10699                        Perl_Extension => 1,
10700                        Range_Size_1 => \&output_perl_charnames_line,
10701                        Type => $STRING,
10702                        );
10703         $perl_charname->set_proxy_for('Name');
10704
10705         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10706                                         Directory => File::Spec->curdir(),
10707                                         File => 'Decomposition',
10708                                         Format => $DECOMP_STRING_FORMAT,
10709                                         Fate => $INTERNAL_ONLY,
10710                                         Perl_Extension => 1,
10711                                         Default_Map => $CODE_POINT,
10712
10713                                         # normalize.pm can't cope with these
10714                                         Output_Range_Counts => 0,
10715
10716                                         # This is a specially formatted table
10717                                         # explicitly for normalize.pm, which
10718                                         # is expecting a particular format,
10719                                         # which means that mappings containing
10720                                         # multiple code points are in the main
10721                                         # body of the table
10722                                         Map_Type => $COMPUTE_NO_MULTI_CP,
10723                                         Type => $STRING,
10724                                         To_Output_Map => $INTERNAL_MAP,
10725                                         );
10726         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10727         $Perl_decomp->add_comment(join_lines(<<END
10728 This mapping is a combination of the Unicode 'Decomposition_Type' and
10729 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
10730 identical to the official Unicode 'Decomposition_Mapping' property except for
10731 two things:
10732  1) It omits the algorithmically determinable Hangul syllable decompositions,
10733 which normalize.pm handles algorithmically.
10734  2) It contains the decomposition type as well.  Non-canonical decompositions
10735 begin with a word in angle brackets, like <super>, which denotes the
10736 compatible decomposition type.  If the map does not begin with the <angle
10737 brackets>, the decomposition is canonical.
10738 END
10739         ));
10740
10741         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10742                                         Default_Map => "",
10743                                         Perl_Extension => 1,
10744                                         Directory => $map_directory,
10745                                         Type => $STRING,
10746                                         To_Output_Map => $OUTPUT_ADJUSTED,
10747                                         );
10748         $Decimal_Digit->add_comment(join_lines(<<END
10749 This file gives the mapping of all code points which represent a single
10750 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10751 points, and the mapping of each non-initial element of each range is actually
10752 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10753 These code points are those that have Numeric_Type=Decimal; not special
10754 things, like subscripts nor Roman numerals.
10755 END
10756         ));
10757
10758         # These properties are not used for generating anything else, and are
10759         # usually not output.  By making them last in the list, we can just
10760         # change the high end of the loop downwards to avoid the work of
10761         # generating a table(s) that is/are just going to get thrown away.
10762         if (! property_ref('Decomposition_Mapping')->to_output_map
10763             && ! property_ref('Name')->to_output_map)
10764         {
10765             $last_field = min($NAME, $DECOMP_MAP) - 1;
10766         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10767             $last_field = $DECOMP_MAP;
10768         } elsif (property_ref('Name')->to_output_map) {
10769             $last_field = $NAME;
10770         }
10771         return;
10772     }
10773
10774     my $first_time = 1;                 # ? Is this the first line of the file
10775     my $in_range = 0;                   # ? Are we in one of the file's ranges
10776     my $previous_cp;                    # hex code point of previous line
10777     my $decimal_previous_cp = -1;       # And its decimal equivalent
10778     my @start;                          # For each field, the current starting
10779                                         # code point in hex for the range
10780                                         # being accumulated.
10781     my @fields;                         # The input fields;
10782     my @previous_fields;                # And those from the previous call
10783
10784     sub filter_UnicodeData_line {
10785         # Handle a single input line from UnicodeData.txt; see comments above
10786         # Conceptually this takes a single line from the file containing N
10787         # properties, and converts it into N lines with one property per line,
10788         # which is what the final handler expects.  But there are
10789         # complications due to the quirkiness of the input file, and to save
10790         # time, it accumulates ranges where the property values don't change
10791         # and only emits lines when necessary.  This is about an order of
10792         # magnitude fewer lines emitted.
10793
10794         my $file = shift;
10795         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10796
10797         # $_ contains the input line.
10798         # -1 in split means retain trailing null fields
10799         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10800
10801         #local $to_trace = 1 if main::DEBUG;
10802         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10803         if (@fields > $input_field_count) {
10804             $file->carp_bad_line('Extra fields');
10805             $_ = "";
10806             return;
10807         }
10808
10809         my $decimal_cp = hex $cp;
10810
10811         # We have to output all the buffered ranges when the next code point
10812         # is not exactly one after the previous one, which means there is a
10813         # gap in the ranges.
10814         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10815
10816         # The decomposition mapping field requires special handling.  It looks
10817         # like either:
10818         #
10819         # <compat> 0032 0020
10820         # 0041 0300
10821         #
10822         # The decomposition type is enclosed in <brackets>; if missing, it
10823         # means the type is canonical.  There are two decomposition mapping
10824         # tables: the one for use by Perl's normalize.pm has a special format
10825         # which is this field intact; the other, for general use is of
10826         # standard format.  In either case we have to find the decomposition
10827         # type.  Empty fields have None as their type, and map to the code
10828         # point itself
10829         if ($fields[$PERL_DECOMPOSITION] eq "") {
10830             $fields[$DECOMP_TYPE] = 'None';
10831             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10832         }
10833         else {
10834             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10835                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
10836             if (! defined $fields[$DECOMP_TYPE]) {
10837                 $fields[$DECOMP_TYPE] = 'Canonical';
10838                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10839             }
10840             else {
10841                 $fields[$DECOMP_MAP] = $map;
10842             }
10843         }
10844
10845         # The 3 numeric fields also require special handling.  The 2 digit
10846         # fields must be either empty or match the number field.  This means
10847         # that if it is empty, they must be as well, and the numeric type is
10848         # None, and the numeric value is 'Nan'.
10849         # The decimal digit field must be empty or match the other digit
10850         # field.  If the decimal digit field is non-empty, the code point is
10851         # a decimal digit, and the other two fields will have the same value.
10852         # If it is empty, but the other digit field is non-empty, the code
10853         # point is an 'other digit', and the number field will have the same
10854         # value as the other digit field.  If the other digit field is empty,
10855         # but the number field is non-empty, the code point is a generic
10856         # numeric type.
10857         if ($fields[$NUMERIC] eq "") {
10858             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10859                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10860             ) {
10861                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
10862             }
10863             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10864             $fields[$NUMERIC] = 'NaN';
10865         }
10866         else {
10867             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10868             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10869                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10870                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
10871                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10872             }
10873             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10874                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10875                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10876             }
10877             else {
10878                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10879
10880                 # Rationals require extra effort.
10881                 register_fraction($fields[$NUMERIC])
10882                                                 if $fields[$NUMERIC] =~ qr{/};
10883             }
10884         }
10885
10886         # For the properties that have empty fields in the file, and which
10887         # mean something different from empty, change them to that default.
10888         # Certain fields just haven't been empty so far in any Unicode
10889         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10890         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
10891         # the defaults; which are very unlikely to ever change.
10892         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10893         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10894
10895         # UAX44 says that if title is empty, it is the same as whatever upper
10896         # is,
10897         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10898
10899         # There are a few pairs of lines like:
10900         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10901         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10902         # that define ranges.  These should be processed after the fields are
10903         # adjusted above, as they may override some of them; but mostly what
10904         # is left is to possibly adjust the $CHARNAME field.  The names of all the
10905         # paired lines start with a '<', but this is also true of '<control>,
10906         # which isn't one of these special ones.
10907         if ($fields[$CHARNAME] eq '<control>') {
10908
10909             # Some code points in this file have the pseudo-name
10910             # '<control>', but the official name for such ones is the null
10911             # string.
10912             $fields[$NAME] = $fields[$CHARNAME] = "";
10913
10914             # We had better not be in between range lines.
10915             if ($in_range) {
10916                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10917                 $in_range = 0;
10918             }
10919         }
10920         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10921
10922             # Here is a non-range line.  We had better not be in between range
10923             # lines.
10924             if ($in_range) {
10925                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10926                 $in_range = 0;
10927             }
10928             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10929
10930                 # These are code points whose names end in their code points,
10931                 # which means the names are algorithmically derivable from the
10932                 # code points.  To shorten the output Name file, the algorithm
10933                 # for deriving these is placed in the file instead of each
10934                 # code point, so they have map type $CP_IN_NAME
10935                 $fields[$CHARNAME] = $CMD_DELIM
10936                                  . $MAP_TYPE_CMD
10937                                  . '='
10938                                  . $CP_IN_NAME
10939                                  . $CMD_DELIM
10940                                  . $fields[$CHARNAME];
10941             }
10942             $fields[$NAME] = $fields[$CHARNAME];
10943         }
10944         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10945             $fields[$CHARNAME] = $fields[$NAME] = $1;
10946
10947             # Here we are at the beginning of a range pair.
10948             if ($in_range) {
10949                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
10950             }
10951             $in_range = 1;
10952
10953             # Because the properties in the range do not overwrite any already
10954             # in the db, we must flush the buffers of what's already there, so
10955             # they get handled in the normal scheme.
10956             $force_output = 1;
10957
10958         }
10959         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10960             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
10961             $_ = "";
10962             return;
10963         }
10964         else { # Here, we are at the last line of a range pair.
10965
10966             if (! $in_range) {
10967                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
10968                 $_ = "";
10969                 return;
10970             }
10971             $in_range = 0;
10972
10973             $fields[$NAME] = $fields[$CHARNAME];
10974
10975             # Check that the input is valid: that the closing of the range is
10976             # the same as the beginning.
10977             foreach my $i (0 .. $last_field) {
10978                 next if $fields[$i] eq $previous_fields[$i];
10979                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
10980             }
10981
10982             # The processing differs depending on the type of range,
10983             # determined by its $CHARNAME
10984             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10985
10986                 # Check that the data looks right.
10987                 if ($decimal_previous_cp != $SBase) {
10988                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
10989                 }
10990                 if ($decimal_cp != $SBase + $SCount - 1) {
10991                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
10992                 }
10993
10994                 # The Hangul syllable range has a somewhat complicated name
10995                 # generation algorithm.  Each code point in it has a canonical
10996                 # decomposition also computable by an algorithm.  The
10997                 # perl decomposition map table built from these is used only
10998                 # by normalize.pm, which has the algorithm built in it, so the
10999                 # decomposition maps are not needed, and are large, so are
11000                 # omitted from it.  If the full decomposition map table is to
11001                 # be output, the decompositions are generated for it, in the
11002                 # EOF handling code for this input file.
11003
11004                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
11005
11006                 # This range is stored in our internal structure with its
11007                 # own map type, different from all others.
11008                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11009                                         = $CMD_DELIM
11010                                           . $MAP_TYPE_CMD
11011                                           . '='
11012                                           . $HANGUL_SYLLABLE
11013                                           . $CMD_DELIM
11014                                           . $fields[$CHARNAME];
11015             }
11016             elsif ($fields[$CHARNAME] =~ /^CJK/) {
11017
11018                 # The name for these contains the code point itself, and all
11019                 # are defined to have the same base name, regardless of what
11020                 # is in the file.  They are stored in our internal structure
11021                 # with a map type of $CP_IN_NAME
11022                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11023                                         = $CMD_DELIM
11024                                            . $MAP_TYPE_CMD
11025                                            . '='
11026                                            . $CP_IN_NAME
11027                                            . $CMD_DELIM
11028                                            . 'CJK UNIFIED IDEOGRAPH';
11029
11030             }
11031             elsif ($fields[$CATEGORY] eq 'Co'
11032                      || $fields[$CATEGORY] eq 'Cs')
11033             {
11034                 # The names of all the code points in these ranges are set to
11035                 # null, as there are no names for the private use and
11036                 # surrogate code points.
11037
11038                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11039             }
11040             else {
11041                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
11042             }
11043
11044             # The first line of the range caused everything else to be output,
11045             # and then its values were stored as the beginning values for the
11046             # next set of ranges, which this one ends.  Now, for each value,
11047             # add a command to tell the handler that these values should not
11048             # replace any existing ones in our database.
11049             foreach my $i (0 .. $last_field) {
11050                 $previous_fields[$i] = $CMD_DELIM
11051                                         . $REPLACE_CMD
11052                                         . '='
11053                                         . $NO
11054                                         . $CMD_DELIM
11055                                         . $previous_fields[$i];
11056             }
11057
11058             # And change things so it looks like the entire range has been
11059             # gone through with this being the final part of it.  Adding the
11060             # command above to each field will cause this range to be flushed
11061             # during the next iteration, as it guaranteed that the stored
11062             # field won't match whatever value the next one has.
11063             $previous_cp = $cp;
11064             $decimal_previous_cp = $decimal_cp;
11065
11066             # We are now set up for the next iteration; so skip the remaining
11067             # code in this subroutine that does the same thing, but doesn't
11068             # know about these ranges.
11069             $_ = "";
11070
11071             return;
11072         }
11073
11074         # On the very first line, we fake it so the code below thinks there is
11075         # nothing to output, and initialize so that when it does get output it
11076         # uses the first line's values for the lowest part of the range.
11077         # (One could avoid this by using peek(), but then one would need to
11078         # know the adjustments done above and do the same ones in the setup
11079         # routine; not worth it)
11080         if ($first_time) {
11081             $first_time = 0;
11082             @previous_fields = @fields;
11083             @start = ($cp) x scalar @fields;
11084             $decimal_previous_cp = $decimal_cp - 1;
11085         }
11086
11087         # For each field, output the stored up ranges that this code point
11088         # doesn't fit in.  Earlier we figured out if all ranges should be
11089         # terminated because of changing the replace or map type styles, or if
11090         # there is a gap between this new code point and the previous one, and
11091         # that is stored in $force_output.  But even if those aren't true, we
11092         # need to output the range if this new code point's value for the
11093         # given property doesn't match the stored range's.
11094         #local $to_trace = 1 if main::DEBUG;
11095         foreach my $i (0 .. $last_field) {
11096             my $field = $fields[$i];
11097             if ($force_output || $field ne $previous_fields[$i]) {
11098
11099                 # Flush the buffer of stored values.
11100                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11101
11102                 # Start a new range with this code point and its value
11103                 $start[$i] = $cp;
11104                 $previous_fields[$i] = $field;
11105             }
11106         }
11107
11108         # Set the values for the next time.
11109         $previous_cp = $cp;
11110         $decimal_previous_cp = $decimal_cp;
11111
11112         # The input line has generated whatever adjusted lines are needed, and
11113         # should not be looked at further.
11114         $_ = "";
11115         return;
11116     }
11117
11118     sub EOF_UnicodeData {
11119         # Called upon EOF to flush the buffers, and create the Hangul
11120         # decomposition mappings if needed.
11121
11122         my $file = shift;
11123         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11124
11125         # Flush the buffers.
11126         foreach my $i (0 .. $last_field) {
11127             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11128         }
11129
11130         if (-e 'Jamo.txt') {
11131
11132             # The algorithm is published by Unicode, based on values in
11133             # Jamo.txt, (which should have been processed before this
11134             # subroutine), and the results left in %Jamo
11135             unless (%Jamo) {
11136                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
11137                 return;
11138             }
11139
11140             # If the full decomposition map table is being output, insert
11141             # into it the Hangul syllable mappings.  This is to avoid having
11142             # to publish a subroutine in it to compute them.  (which would
11143             # essentially be this code.)  This uses the algorithm published by
11144             # Unicode.  (No hangul syllables in version 1)
11145             if ($v_version ge v2.0.0
11146                 && property_ref('Decomposition_Mapping')->to_output_map) {
11147                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11148                     use integer;
11149                     my $SIndex = $S - $SBase;
11150                     my $L = $LBase + $SIndex / $NCount;
11151                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
11152                     my $T = $TBase + $SIndex % $TCount;
11153
11154                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11155                     my $decomposition = sprintf("%04X %04X", $L, $V);
11156                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11157                     $file->insert_adjusted_lines(
11158                                 sprintf("%04X; Decomposition_Mapping; %s",
11159                                         $S,
11160                                         $decomposition));
11161                 }
11162             }
11163         }
11164
11165         return;
11166     }
11167
11168     sub filter_v1_ucd {
11169         # Fix UCD lines in version 1.  This is probably overkill, but this
11170         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
11171         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
11172         #       removed.  This program retains them
11173         # 2)    didn't include ranges, which it should have, and which are now
11174         #       added in @corrected_lines below.  It was hand populated by
11175         #       taking the data from Version 2, verified by analyzing
11176         #       DAge.txt.
11177         # 3)    There is a syntax error in the entry for U+09F8 which could
11178         #       cause problems for utf8_heavy, and so is changed.  It's
11179         #       numeric value was simply a minus sign, without any number.
11180         #       (Eventually Unicode changed the code point to non-numeric.)
11181         # 4)    The decomposition types often don't match later versions
11182         #       exactly, and the whole syntax of that field is different; so
11183         #       the syntax is changed as well as the types to their later
11184         #       terminology.  Otherwise normalize.pm would be very unhappy
11185         # 5)    Many ccc classes are different.  These are left intact.
11186         # 6)    U+FF10..U+FF19 are missing their numeric values in all three
11187         #       fields.  These are unchanged because it doesn't really cause
11188         #       problems for Perl.
11189         # 7)    A number of code points, such as controls, don't have their
11190         #       Unicode Version 1 Names in this file.  These are added.
11191         # 8)    A number of Symbols were marked as Lm.  This changes those in
11192         #       the Latin1 range, so that regexes work.
11193         # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
11194         #       referred to by their lc equivalents.  Not fixed.
11195
11196         my @corrected_lines = split /\n/, <<'END';
11197 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11198 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11199 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11200 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11201 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11202 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11203 END
11204
11205         my $file = shift;
11206         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11207
11208         #local $to_trace = 1 if main::DEBUG;
11209         trace $_ if main::DEBUG && $to_trace;
11210
11211         # -1 => retain trailing null fields
11212         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11213
11214         # At the first place that is wrong in the input, insert all the
11215         # corrections, replacing the wrong line.
11216         if ($code_point eq '4E00') {
11217             my @copy = @corrected_lines;
11218             $_ = shift @copy;
11219             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11220
11221             $file->insert_lines(@copy);
11222         }
11223         elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11224
11225             # There are no Lm characters in Latin1; these should be 'Sk', but
11226             # there isn't that in V1.
11227             $fields[$CATEGORY] = 'So';
11228         }
11229
11230         if ($fields[$NUMERIC] eq '-') {
11231             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
11232         }
11233
11234         if  ($fields[$PERL_DECOMPOSITION] ne "") {
11235
11236             # Several entries have this change to superscript 2 or 3 in the
11237             # middle.  Convert these to the modern version, which is to use
11238             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11239             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11240             # 'HHHH HHHH 00B3 HHHH'.
11241             # It turns out that all of these that don't have another
11242             # decomposition defined at the beginning of the line have the
11243             # <square> decomposition in later releases.
11244             if ($code_point ne '00B2' && $code_point ne '00B3') {
11245                 if  ($fields[$PERL_DECOMPOSITION]
11246                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11247                 {
11248                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11249                         $fields[$PERL_DECOMPOSITION] = '<square> '
11250                         . $fields[$PERL_DECOMPOSITION];
11251                     }
11252                 }
11253             }
11254
11255             # If is like '<+circled> 0052 <-circled>', convert to
11256             # '<circled> 0052'
11257             $fields[$PERL_DECOMPOSITION] =~
11258                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11259
11260             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11261             $fields[$PERL_DECOMPOSITION] =~
11262                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11263             or $fields[$PERL_DECOMPOSITION] =~
11264                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11265             or $fields[$PERL_DECOMPOSITION] =~
11266                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11267             or $fields[$PERL_DECOMPOSITION] =~
11268                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11269
11270             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11271             $fields[$PERL_DECOMPOSITION] =~
11272                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11273
11274             # Change names to modern form.
11275             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11276             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11277             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11278             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11279
11280             # One entry has weird braces
11281             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11282
11283             # One entry at U+2116 has an extra <sup>
11284             $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
11285         }
11286
11287         $_ = join ';', $code_point, @fields;
11288         trace $_ if main::DEBUG && $to_trace;
11289         return;
11290     }
11291
11292     sub filter_bad_Nd_ucd {
11293         # Early versions specified a value in the decimal digit field even
11294         # though the code point wasn't a decimal digit.  Clear the field in
11295         # that situation, so that the main code doesn't think it is a decimal
11296         # digit.
11297
11298         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11299         if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11300             $fields[$PERL_DECIMAL_DIGIT] = "";
11301             $_ = join ';', $code_point, @fields;
11302         }
11303         return;
11304     }
11305
11306     my @U1_control_names = split /\n/, <<'END';
11307 NULL
11308 START OF HEADING
11309 START OF TEXT
11310 END OF TEXT
11311 END OF TRANSMISSION
11312 ENQUIRY
11313 ACKNOWLEDGE
11314 BELL
11315 BACKSPACE
11316 HORIZONTAL TABULATION
11317 LINE FEED
11318 VERTICAL TABULATION
11319 FORM FEED
11320 CARRIAGE RETURN
11321 SHIFT OUT
11322 SHIFT IN
11323 DATA LINK ESCAPE
11324 DEVICE CONTROL ONE
11325 DEVICE CONTROL TWO
11326 DEVICE CONTROL THREE
11327 DEVICE CONTROL FOUR
11328 NEGATIVE ACKNOWLEDGE
11329 SYNCHRONOUS IDLE
11330 END OF TRANSMISSION BLOCK
11331 CANCEL
11332 END OF MEDIUM
11333 SUBSTITUTE
11334 ESCAPE
11335 FILE SEPARATOR
11336 GROUP SEPARATOR
11337 RECORD SEPARATOR
11338 UNIT SEPARATOR
11339 DELETE
11340 BREAK PERMITTED HERE
11341 NO BREAK HERE
11342 INDEX
11343 NEXT LINE
11344 START OF SELECTED AREA
11345 END OF SELECTED AREA
11346 CHARACTER TABULATION SET
11347 CHARACTER TABULATION WITH JUSTIFICATION
11348 LINE TABULATION SET
11349 PARTIAL LINE DOWN
11350 PARTIAL LINE UP
11351 REVERSE LINE FEED
11352 SINGLE SHIFT TWO
11353 SINGLE SHIFT THREE
11354 DEVICE CONTROL STRING
11355 PRIVATE USE ONE
11356 PRIVATE USE TWO
11357 SET TRANSMIT STATE
11358 CANCEL CHARACTER
11359 MESSAGE WAITING
11360 START OF GUARDED AREA
11361 END OF GUARDED AREA
11362 START OF STRING
11363 SINGLE CHARACTER INTRODUCER
11364 CONTROL SEQUENCE INTRODUCER
11365 STRING TERMINATOR
11366 OPERATING SYSTEM COMMAND
11367 PRIVACY MESSAGE
11368 APPLICATION PROGRAM COMMAND
11369 END
11370
11371     sub filter_early_U1_names {
11372         # Very early versions did not have the Unicode_1_name field specified.
11373         # They differed in which ones were present; make sure a U1 name
11374         # exists, so that Unicode::UCD::charinfo will work
11375
11376         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11377
11378
11379         # @U1_control names above are entirely positional, so we pull them out
11380         # in the exact order required, with gaps for the ones that don't have
11381         # names.
11382         if ($code_point =~ /^00[01]/
11383             || $code_point eq '007F'
11384             || $code_point =~ /^008[2-9A-F]/
11385             || $code_point =~ /^009[0-8A-F]/)
11386         {
11387             my $u1_name = shift @U1_control_names;
11388             $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11389             $_ = join ';', $code_point, @fields;
11390         }
11391         return;
11392     }
11393
11394     sub filter_v2_1_5_ucd {
11395         # A dozen entries in this 2.1.5 file had the mirrored and numeric
11396         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
11397         # column appears to be N, swap it back.
11398
11399         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11400         if ($fields[$NUMERIC] eq 'N') {
11401             $fields[$NUMERIC] = $fields[$MIRRORED];
11402             $fields[$MIRRORED] = 'N';
11403             $_ = join ';', $code_point, @fields;
11404         }
11405         return;
11406     }
11407
11408     sub filter_v6_ucd {
11409
11410         # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11411         # it wasn't accepted, to allow for some deprecation cycles.  This
11412         # function is not called after 5.16
11413
11414         return if $_ !~ /^(?:0007|1F514|070F);/;
11415
11416         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11417         if ($code_point eq '0007') {
11418             $fields[$CHARNAME] = "";
11419         }
11420         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11421                             # http://www.unicode.org/versions/corrigendum8.html
11422             $fields[$BIDI] = "AL";
11423         }
11424         elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11425             $fields[$CHARNAME] = "";
11426         }
11427
11428         $_ = join ';', $code_point, @fields;
11429
11430         return;
11431     }
11432 } # End closure for UnicodeData
11433
11434 sub process_GCB_test {
11435
11436     my $file = shift;
11437     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11438
11439     while ($file->next_line) {
11440         push @backslash_X_tests, $_;
11441     }
11442
11443     return;
11444 }
11445
11446 sub process_NamedSequences {
11447     # NamedSequences.txt entries are just added to an array.  Because these
11448     # don't look like the other tables, they have their own handler.
11449     # An example:
11450     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11451     #
11452     # This just adds the sequence to an array for later handling
11453
11454     my $file = shift;
11455     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11456
11457     while ($file->next_line) {
11458         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11459         if (@remainder) {
11460             $file->carp_bad_line(
11461                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11462             next;
11463         }
11464
11465         # Note single \t in keeping with special output format of
11466         # Perl_charnames.  But it turns out that the code points don't have to
11467         # be 5 digits long, like the rest, based on the internal workings of
11468         # charnames.pm.  This could be easily changed for consistency.
11469         push @named_sequences, "$sequence\t$name";
11470     }
11471     return;
11472 }
11473
11474 { # Closure
11475
11476     my $first_range;
11477
11478     sub  filter_early_ea_lb {
11479         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
11480         # third field be the name of the code point, which can be ignored in
11481         # most cases.  But it can be meaningful if it marks a range:
11482         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11483         # 3400;W;<CJK Ideograph Extension A, First>
11484         #
11485         # We need to see the First in the example above to know it's a range.
11486         # They did not use the later range syntaxes.  This routine changes it
11487         # to use the modern syntax.
11488         # $1 is the Input_file object.
11489
11490         my @fields = split /\s*;\s*/;
11491         if ($fields[2] =~ /^<.*, First>/) {
11492             $first_range = $fields[0];
11493             $_ = "";
11494         }
11495         elsif ($fields[2] =~ /^<.*, Last>/) {
11496             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11497         }
11498         else {
11499             undef $first_range;
11500             $_ = "$fields[0]; $fields[1]";
11501         }
11502
11503         return;
11504     }
11505 }
11506
11507 sub filter_old_style_arabic_shaping {
11508     # Early versions used a different term for the later one.
11509
11510     my @fields = split /\s*;\s*/;
11511     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11512     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
11513     $_ = join ';', @fields;
11514     return;
11515 }
11516
11517 { # Closure
11518     my $lc; # Table for lowercase mapping
11519     my $tc;
11520     my $uc;
11521     my %special_casing_code_points;
11522
11523     sub setup_special_casing {
11524         # SpecialCasing.txt contains the non-simple case change mappings.  The
11525         # simple ones are in UnicodeData.txt, which should already have been
11526         # read in to the full property data structures, so as to initialize
11527         # these with the simple ones.  Then the SpecialCasing.txt entries
11528         # add or overwrite the ones which have different full mappings.
11529
11530         # This routine sees if the simple mappings are to be output, and if
11531         # so, copies what has already been put into the full mapping tables,
11532         # while they still contain only the simple mappings.
11533
11534         # The reason it is done this way is that the simple mappings are
11535         # probably not going to be output, so it saves work to initialize the
11536         # full tables with the simple mappings, and then overwrite those
11537         # relatively few entries in them that have different full mappings,
11538         # and thus skip the simple mapping tables altogether.
11539
11540         my $file= shift;
11541         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11542
11543         $lc = property_ref('lc');
11544         $tc = property_ref('tc');
11545         $uc = property_ref('uc');
11546
11547         # For each of the case change mappings...
11548         foreach my $full_casing_table ($lc, $tc, $uc) {
11549             my $full_casing_name = $full_casing_table->name;
11550             my $full_casing_full_name = $full_casing_table->full_name;
11551             unless (defined $full_casing_table
11552                     && ! $full_casing_table->is_empty)
11553             {
11554                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
11555             }
11556
11557             # Create a table in the old-style format and with the original
11558             # file name for backwards compatibility with applications that
11559             # read it directly.  The new tables contain both the simple and
11560             # full maps, and the old are missing simple maps when there is a
11561             # conflicting full one.  Probably it would have been ok to add
11562             # those to the legacy version, as was already done in 5.14 to the
11563             # case folding one, but this was not done, out of an abundance of
11564             # caution.  The tables are set up here before we deal with the
11565             # full maps so that as we handle those, we can override the simple
11566             # maps for them in the legacy table, and merely add them in the
11567             # new-style one.
11568             my $legacy = Property->new("Legacy_" . $full_casing_full_name,
11569                                 File => $full_casing_full_name
11570                                                           =~ s/case_Mapping//r,
11571                                 Format => $HEX_FORMAT,
11572                                 Default_Map => $CODE_POINT,
11573                                 Initialize => $full_casing_table,
11574                                 Replacement_Property => $full_casing_full_name,
11575             );
11576
11577             $full_casing_table->add_comment(join_lines( <<END
11578 This file includes both the simple and full case changing maps.  The simple
11579 ones are in the main body of the table below, and the full ones adding to or
11580 overriding them are in the hash.
11581 END
11582             ));
11583
11584             # The simple version's name in each mapping merely has an 's' in
11585             # front of the full one's
11586             my $simple_name = 's' . $full_casing_name;
11587             my $simple = property_ref($simple_name);
11588             $simple->initialize($full_casing_table) if $simple->to_output_map();
11589         }
11590
11591         return;
11592     }
11593
11594     sub filter_2_1_8_special_casing_line {
11595
11596         # This version had duplicate entries in this file.  Delete all but the
11597         # first one
11598         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11599                                               # fields
11600         if (exists $special_casing_code_points{$fields[0]}) {
11601             $_ = "";
11602             return;
11603         }
11604
11605         $special_casing_code_points{$fields[0]} = 1;
11606         filter_special_casing_line(@_);
11607     }
11608
11609     sub filter_special_casing_line {
11610         # Change the format of $_ from SpecialCasing.txt into something that
11611         # the generic handler understands.  Each input line contains three
11612         # case mappings.  This will generate three lines to pass to the
11613         # generic handler for each of those.
11614
11615         # The input syntax (after stripping comments and trailing white space
11616         # is like one of the following (with the final two being entries that
11617         # we ignore):
11618         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11619         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11620         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11621         # Note the trailing semi-colon, unlike many of the input files.  That
11622         # means that there will be an extra null field generated by the split
11623
11624         my $file = shift;
11625         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11626
11627         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11628                                               # fields
11629
11630         # field #4 is when this mapping is conditional.  If any of these get
11631         # implemented, it would be by hard-coding in the casing functions in
11632         # the Perl core, not through tables.  But if there is a new condition
11633         # we don't know about, output a warning.  We know about all the
11634         # conditions through 6.0
11635         if ($fields[4] ne "") {
11636             my @conditions = split ' ', $fields[4];
11637             if ($conditions[0] ne 'tr'  # We know that these languages have
11638                                         # conditions, and some are multiple
11639                 && $conditions[0] ne 'az'
11640                 && $conditions[0] ne 'lt'
11641
11642                 # And, we know about a single condition Final_Sigma, but
11643                 # nothing else.
11644                 && ($v_version gt v5.2.0
11645                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11646             {
11647                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11648             }
11649             elsif ($conditions[0] ne 'Final_Sigma') {
11650
11651                     # Don't print out a message for Final_Sigma, because we
11652                     # have hard-coded handling for it.  (But the standard
11653                     # could change what the rule should be, but it wouldn't
11654                     # show up here anyway.
11655
11656                     print "# SKIPPING Special Casing: $_\n"
11657                                                     if $verbosity >= $VERBOSE;
11658             }
11659             $_ = "";
11660             return;
11661         }
11662         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11663             $file->carp_bad_line('Extra fields');
11664             $_ = "";
11665             return;
11666         }
11667
11668         my $decimal_code_point = hex $fields[0];
11669
11670         # Loop to handle each of the three mappings in the input line, in
11671         # order, with $i indicating the current field number.
11672         my $i = 0;
11673         for my $object ($lc, $tc, $uc) {
11674             $i++;   # First time through, $i = 0 ... 3rd time = 3
11675
11676             my $value = $object->value_of($decimal_code_point);
11677             $value = ($value eq $CODE_POINT)
11678                       ? $decimal_code_point
11679                       : hex $value;
11680
11681             # If this isn't a multi-character mapping, it should already have
11682             # been read in.
11683             if ($fields[$i] !~ / /) {
11684                 if ($value != hex $fields[$i]) {
11685                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
11686                                   . $object->name
11687                                   . "(0x$fields[0]) is $value"
11688                                   . " and SpecialCasing.txt thinks it is "
11689                                   . hex($fields[$i])
11690                                   . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
11691                 }
11692             }
11693             else {
11694
11695                 # The mapping goes into both the legacy table, in which it
11696                 # replaces the simple one...
11697                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11698                                              . $object->full_name
11699                                              . "; $fields[$i]");
11700
11701                 # ... and the regular table, in which it is additional,
11702                 # beyond the simple mapping.
11703                 $file->insert_adjusted_lines("$fields[0]; "
11704                                              . $object->name
11705                                             . "; "
11706                                             . $CMD_DELIM
11707                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11708                                             . $CMD_DELIM
11709                                             . $fields[$i]);
11710             }
11711         }
11712
11713         # Everything has been handled by the insert_adjusted_lines()
11714         $_ = "";
11715
11716         return;
11717     }
11718 }
11719
11720 sub filter_old_style_case_folding {
11721     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11722     # and later style.  Different letters were used in the earlier.
11723
11724     my $file = shift;
11725     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11726
11727     my @fields = split /\s*;\s*/;
11728     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11729         $fields[1] = 'I';
11730     }
11731     elsif ($fields[1] eq 'L') {
11732         $fields[1] = 'C';             # L => C always
11733     }
11734     elsif ($fields[1] eq 'E') {
11735         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
11736             $fields[1] = 'F'
11737         }
11738         else {
11739             $fields[1] = 'C'
11740         }
11741     }
11742     else {
11743         $file->carp_bad_line("Expecting L or E in second field");
11744         $_ = "";
11745         return;
11746     }
11747     $_ = join("; ", @fields) . ';';
11748     return;
11749 }
11750
11751 { # Closure for case folding
11752
11753     # Create the map for simple only if are going to output it, for otherwise
11754     # it takes no part in anything we do.
11755     my $to_output_simple;
11756
11757     sub setup_case_folding($) {
11758         # Read in the case foldings in CaseFolding.txt.  This handles both
11759         # simple and full case folding.
11760
11761         $to_output_simple
11762                         = property_ref('Simple_Case_Folding')->to_output_map;
11763
11764         if (! $to_output_simple) {
11765             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11766         }
11767
11768         # If we ever wanted to show that these tables were combined, a new
11769         # property method could be created, like set_combined_props()
11770         property_ref('Case_Folding')->add_comment(join_lines( <<END
11771 This file includes both the simple and full case folding maps.  The simple
11772 ones are in the main body of the table below, and the full ones adding to or
11773 overriding them are in the hash.
11774 END
11775         ));
11776         return;
11777     }
11778
11779     sub filter_case_folding_line {
11780         # Called for each line in CaseFolding.txt
11781         # Input lines look like:
11782         # 0041; C; 0061; # LATIN CAPITAL LETTER A
11783         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11784         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11785         #
11786         # 'C' means that folding is the same for both simple and full
11787         # 'F' that it is only for full folding
11788         # 'S' that it is only for simple folding
11789         # 'T' is locale-dependent, and ignored
11790         # 'I' is a type of 'F' used in some early releases.
11791         # Note the trailing semi-colon, unlike many of the input files.  That
11792         # means that there will be an extra null field generated by the split
11793         # below, which we ignore and hence is not an error.
11794
11795         my $file = shift;
11796         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11797
11798         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11799         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11800             $file->carp_bad_line('Extra fields');
11801             $_ = "";
11802             return;
11803         }
11804
11805         if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
11806             $_ = "";
11807             return;
11808         }
11809
11810         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
11811         # I are all full foldings; S is single-char.  For S, there is always
11812         # an F entry, so we must allow multiple values for the same code
11813         # point.  Fortunately this table doesn't need further manipulation
11814         # which would preclude using multiple-values.  The S is now included
11815         # so that _swash_inversion_hash() is able to construct closures
11816         # without having to worry about F mappings.
11817         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11818             $_ = "$range; Case_Folding; "
11819                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11820         }
11821         else {
11822             $_ = "";
11823             $file->carp_bad_line('Expecting C F I S or T in second field');
11824         }
11825
11826         # C and S are simple foldings, but simple case folding is not needed
11827         # unless we explicitly want its map table output.
11828         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11829             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11830         }
11831
11832         return;
11833     }
11834
11835 } # End case fold closure
11836
11837 sub filter_jamo_line {
11838     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
11839     # from this file that is used in generating the Name property for Jamo
11840     # code points.  But, it also is used to convert early versions' syntax
11841     # into the modern form.  Here are two examples:
11842     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
11843     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
11844     #
11845     # The input is $_, the output is $_ filtered.
11846
11847     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
11848
11849     # Let the caller handle unexpected input.  In earlier versions, there was
11850     # a third field which is supposed to be a comment, but did not have a '#'
11851     # before it.
11852     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11853
11854     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
11855                                 # beginning.
11856
11857     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
11858     $fields[1] = 'R' if $fields[0] eq '1105';
11859
11860     # Add to structure so can generate Names from it.
11861     my $cp = hex $fields[0];
11862     my $short_name = $fields[1];
11863     $Jamo{$cp} = $short_name;
11864     if ($cp <= $LBase + $LCount) {
11865         $Jamo_L{$short_name} = $cp - $LBase;
11866     }
11867     elsif ($cp <= $VBase + $VCount) {
11868         $Jamo_V{$short_name} = $cp - $VBase;
11869     }
11870     elsif ($cp <= $TBase + $TCount) {
11871         $Jamo_T{$short_name} = $cp - $TBase;
11872     }
11873     else {
11874         Carp::my_carp_bug("Unexpected Jamo code point in $_");
11875     }
11876
11877
11878     # Reassemble using just the first two fields to look like a typical
11879     # property file line
11880     $_ = "$fields[0]; $fields[1]";
11881
11882     return;
11883 }
11884
11885 sub register_fraction($) {
11886     # This registers the input rational number so that it can be passed on to
11887     # utf8_heavy.pl, both in rational and floating forms.
11888
11889     my $rational = shift;
11890
11891     my $float = eval $rational;
11892     $nv_floating_to_rational{$float} = $rational;
11893     return;
11894 }
11895
11896 sub filter_numeric_value_line {
11897     # DNumValues contains lines of a different syntax than the typical
11898     # property file:
11899     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
11900     #
11901     # This routine transforms $_ containing the anomalous syntax to the
11902     # typical, by filtering out the extra columns, and convert early version
11903     # decimal numbers to strings that look like rational numbers.
11904
11905     my $file = shift;
11906     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11907
11908     # Starting in 5.1, there is a rational field.  Just use that, omitting the
11909     # extra columns.  Otherwise convert the decimal number in the second field
11910     # to a rational, and omit extraneous columns.
11911     my @fields = split /\s*;\s*/, $_, -1;
11912     my $rational;
11913
11914     if ($v_version ge v5.1.0) {
11915         if (@fields != 4) {
11916             $file->carp_bad_line('Not 4 semi-colon separated fields');
11917             $_ = "";
11918             return;
11919         }
11920         $rational = $fields[3];
11921         $_ = join '; ', @fields[ 0, 3 ];
11922     }
11923     else {
11924
11925         # Here, is an older Unicode file, which has decimal numbers instead of
11926         # rationals in it.  Use the fraction to calculate the denominator and
11927         # convert to rational.
11928
11929         if (@fields != 2 && @fields != 3) {
11930             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11931             $_ = "";
11932             return;
11933         }
11934
11935         my $codepoints = $fields[0];
11936         my $decimal = $fields[1];
11937         if ($decimal =~ s/\.0+$//) {
11938
11939             # Anything ending with a decimal followed by nothing but 0's is an
11940             # integer
11941             $_ = "$codepoints; $decimal";
11942             $rational = $decimal;
11943         }
11944         else {
11945
11946             my $denominator;
11947             if ($decimal =~ /\.50*$/) {
11948                 $denominator = 2;
11949             }
11950
11951             # Here have the hardcoded repeating decimals in the fraction, and
11952             # the denominator they imply.  There were only a few denominators
11953             # in the older Unicode versions of this file which this code
11954             # handles, so it is easy to convert them.
11955
11956             # The 4 is because of a round-off error in the Unicode 3.2 files
11957             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11958                 $denominator = 3;
11959             }
11960             elsif ($decimal =~ /\.[27]50*$/) {
11961                 $denominator = 4;
11962             }
11963             elsif ($decimal =~ /\.[2468]0*$/) {
11964                 $denominator = 5;
11965             }
11966             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11967                 $denominator = 6;
11968             }
11969             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11970                 $denominator = 8;
11971             }
11972             if ($denominator) {
11973                 my $sign = ($decimal < 0) ? "-" : "";
11974                 my $numerator = int((abs($decimal) * $denominator) + .5);
11975                 $rational = "$sign$numerator/$denominator";
11976                 $_ = "$codepoints; $rational";
11977             }
11978             else {
11979                 $file->carp_bad_line("Can't cope with number '$decimal'.");
11980                 $_ = "";
11981                 return;
11982             }
11983         }
11984     }
11985
11986     register_fraction($rational) if $rational =~ qr{/};
11987     return;
11988 }
11989
11990 { # Closure
11991     my %unihan_properties;
11992
11993     sub setup_unihan {
11994         # Do any special setup for Unihan properties.
11995
11996         # This property gives the wrong computed type, so override.
11997         my $usource = property_ref('kIRG_USource');
11998         $usource->set_type($STRING) if defined $usource;
11999
12000         # This property is to be considered binary (it says so in
12001         # http://www.unicode.org/reports/tr38/)
12002         my $iicore = property_ref('kIICore');
12003         if (defined $iicore) {
12004             $iicore->set_type($FORCED_BINARY);
12005             $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
12006
12007             # Unicode doesn't include the maps for this property, so don't
12008             # warn that they are missing.
12009             $iicore->set_pre_declared_maps(0);
12010             $iicore->add_comment(join_lines( <<END
12011 This property contains enum values, but Unicode UAX #38 says it should be
12012 interpreted as binary, so Perl creates tables for both 1) its enum values,
12013 plus 2) true/false tables in which it is considered true for all code points
12014 that have a non-null value
12015 END
12016             ));
12017         }
12018
12019         return;
12020     }
12021
12022     sub filter_unihan_line {
12023         # Change unihan db lines to look like the others in the db.  Here is
12024         # an input sample:
12025         #   U+341C        kCangjie        IEKN
12026
12027         # Tabs are used instead of semi-colons to separate fields; therefore
12028         # they may have semi-colons embedded in them.  Change these to periods
12029         # so won't screw up the rest of the code.
12030         s/;/./g;
12031
12032         # Remove lines that don't look like ones we accept.
12033         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12034             $_ = "";
12035             return;
12036         }
12037
12038         # Extract the property, and save a reference to its object.
12039         my $property = $1;
12040         if (! exists $unihan_properties{$property}) {
12041             $unihan_properties{$property} = property_ref($property);
12042         }
12043
12044         # Don't do anything unless the property is one we're handling, which
12045         # we determine by seeing if there is an object defined for it or not
12046         if (! defined $unihan_properties{$property}) {
12047             $_ = "";
12048             return;
12049         }
12050
12051         # Convert the tab separators to our standard semi-colons, and convert
12052         # the U+HHHH notation to the rest of the standard's HHHH
12053         s/\t/;/g;
12054         s/\b U \+ (?= $code_point_re )//xg;
12055
12056         #local $to_trace = 1 if main::DEBUG;
12057         trace $_ if main::DEBUG && $to_trace;
12058
12059         return;
12060     }
12061 }
12062
12063 sub filter_blocks_lines {
12064     # In the Blocks.txt file, the names of the blocks don't quite match the
12065     # names given in PropertyValueAliases.txt, so this changes them so they
12066     # do match:  Blanks and hyphens are changed into underscores.  Also makes
12067     # early release versions look like later ones
12068     #
12069     # $_ is transformed to the correct value.
12070
12071     my $file = shift;
12072         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12073
12074     if ($v_version lt v3.2.0) {
12075         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12076             $_ = "";
12077             return;
12078         }
12079
12080         # Old versions used a different syntax to mark the range.
12081         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12082     }
12083
12084     my @fields = split /\s*;\s*/, $_, -1;
12085     if (@fields != 2) {
12086         $file->carp_bad_line("Expecting exactly two fields");
12087         $_ = "";
12088         return;
12089     }
12090
12091     # Change hyphens and blanks in the block name field only
12092     $fields[1] =~ s/[ -]/_/g;
12093     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
12094
12095     $_ = join("; ", @fields);
12096     return;
12097 }
12098
12099 { # Closure
12100     my $current_property;
12101
12102     sub filter_old_style_proplist {
12103         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
12104         # was in a completely different syntax.  Ken Whistler of Unicode says
12105         # that it was something he used as an aid for his own purposes, but
12106         # was never an official part of the standard.  Many of the properties
12107         # in it were incorporated into the later PropList.txt, but some were
12108         # not.  This program uses this early file to generate property tables
12109         # that are otherwise not accessible in the early UCD's.  It does this
12110         # for the ones that eventually became official, and don't appear to be
12111         # too different in their contents from the later official version, and
12112         # throws away the rest.  It could be argued that the ones it generates
12113         # were probably not really official at that time, so should be
12114         # ignored.  You can easily modify things to skip all of them by
12115         # changing this function to just set $_ to "", and return; and to skip
12116         # certain of them by by simply removing their declarations from
12117         # get_old_property_aliases().
12118         #
12119         # Here is a list of all the ones that are thrown away:
12120         #   Alphabetic                   The definitions for this are very
12121         #                                defective, so better to not mislead
12122         #                                people into thinking it works.
12123         #                                Instead the Perl extension of the
12124         #                                same name is constructed from first
12125         #                                principles.
12126         #   Bidi=*                       duplicates UnicodeData.txt
12127         #   Combining                    never made into official property;
12128         #                                is \P{ccc=0}
12129         #   Composite                    never made into official property.
12130         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
12131         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
12132         #   Delimiter                    never made into official property;
12133         #                                removed in 3.0.1
12134         #   Format Control               never made into official property;
12135         #                                similar to gc=cf
12136         #   High Surrogate               duplicates Blocks.txt
12137         #   Ignorable Control            never made into official property;
12138         #                                similar to di=y
12139         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
12140         #   Left of Pair                 never made into official property;
12141         #   Line Separator               duplicates UnicodeData.txt: gc=zl
12142         #   Low Surrogate                duplicates Blocks.txt
12143         #   Non-break                    was actually listed as a property
12144         #                                in 3.2, but without any code
12145         #                                points.  Unicode denies that this
12146         #                                was ever an official property
12147         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
12148         #   Numeric                      duplicates UnicodeData.txt: gc=cc
12149         #   Paired Punctuation           never made into official property;
12150         #                                appears to be gc=ps + gc=pe
12151         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
12152         #   Private Use                  duplicates UnicodeData.txt: gc=co
12153         #   Private Use High Surrogate   duplicates Blocks.txt
12154         #   Punctuation                  duplicates UnicodeData.txt: gc=p
12155         #   Space                        different definition than eventual
12156         #                                one.
12157         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
12158         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
12159         #   Zero-width                   never made into official property;
12160         #                                subset of gc=cf
12161         # Most of the properties have the same names in this file as in later
12162         # versions, but a couple do not.
12163         #
12164         # This subroutine filters $_, converting it from the old style into
12165         # the new style.  Here's a sample of the old-style
12166         #
12167         #   *******************************************
12168         #
12169         #   Property dump for: 0x100000A0 (Join Control)
12170         #
12171         #   200C..200D  (2 chars)
12172         #
12173         # In the example, the property is "Join Control".  It is kept in this
12174         # closure between calls to the subroutine.  The numbers beginning with
12175         # 0x were internal to Ken's program that generated this file.
12176
12177         # If this line contains the property name, extract it.
12178         if (/^Property dump for: [^(]*\((.*)\)/) {
12179             $_ = $1;
12180
12181             # Convert white space to underscores.
12182             s/ /_/g;
12183
12184             # Convert the few properties that don't have the same name as
12185             # their modern counterparts
12186             s/Identifier_Part/ID_Continue/
12187             or s/Not_a_Character/NChar/;
12188
12189             # If the name matches an existing property, use it.
12190             if (defined property_ref($_)) {
12191                 trace "new property=", $_ if main::DEBUG && $to_trace;
12192                 $current_property = $_;
12193             }
12194             else {        # Otherwise discard it
12195                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
12196                 undef $current_property;
12197             }
12198             $_ = "";    # The property is saved for the next lines of the
12199                         # file, but this defining line is of no further use,
12200                         # so clear it so that the caller won't process it
12201                         # further.
12202         }
12203         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
12204
12205             # Here, the input line isn't a header defining a property for the
12206             # following section, and either we aren't in such a section, or
12207             # the line doesn't look like one that defines the code points in
12208             # such a section.  Ignore this line.
12209             $_ = "";
12210         }
12211         else {
12212
12213             # Here, we have a line defining the code points for the current
12214             # stashed property.  Anything starting with the first blank is
12215             # extraneous.  Otherwise, it should look like a normal range to
12216             # the caller.  Append the property name so that it looks just like
12217             # a modern PropList entry.
12218
12219             $_ =~ s/\s.*//;
12220             $_ .= "; $current_property";
12221         }
12222         trace $_ if main::DEBUG && $to_trace;
12223         return;
12224     }
12225 } # End closure for old style proplist
12226
12227 sub filter_old_style_normalization_lines {
12228     # For early releases of Unicode, the lines were like:
12229     #        74..2A76    ; NFKD_NO
12230     # For later releases this became:
12231     #        74..2A76    ; NFKD_QC; N
12232     # Filter $_ to look like those in later releases.
12233     # Similarly for MAYBEs
12234
12235     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
12236
12237     # Also, the property FC_NFKC was abbreviated to FNC
12238     s/FNC/FC_NFKC/;
12239     return;
12240 }
12241
12242 sub setup_script_extensions {
12243     # The Script_Extensions property starts out with a clone of the Script
12244     # property.
12245
12246     my $scx = property_ref("Script_Extensions");
12247     $scx = Property->new("scx", Full_Name => "Script_Extensions")
12248                                                             if ! defined $scx;
12249     $scx->_set_format($STRING_WHITE_SPACE_LIST);
12250     $scx->initialize($script);
12251     $scx->set_default_map($script->default_map);
12252     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
12253     $scx->add_comment(join_lines( <<END
12254 The values for code points that appear in one script are just the same as for
12255 the 'Script' property.  Likewise the values for those that appear in many
12256 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
12257 values of code points that appear in a few scripts are a space separated list
12258 of those scripts.
12259 END
12260     ));
12261
12262     # Initialize scx's tables and the aliases for them to be the same as sc's
12263     foreach my $table ($script->tables) {
12264         my $scx_table = $scx->add_match_table($table->name,
12265                                 Full_Name => $table->full_name);
12266         foreach my $alias ($table->aliases) {
12267             $scx_table->add_alias($alias->name);
12268         }
12269     }
12270 }
12271
12272 sub  filter_script_extensions_line {
12273     # The Scripts file comes with the full name for the scripts; the
12274     # ScriptExtensions, with the short name.  The final mapping file is a
12275     # combination of these, and without adjustment, would have inconsistent
12276     # entries.  This filters the latter file to convert to full names.
12277     # Entries look like this:
12278     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12279
12280     my @fields = split /\s*;\s*/;
12281
12282     # This script was erroneously omitted in this Unicode version.
12283     $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12284
12285     my @full_names;
12286     foreach my $short_name (split " ", $fields[1]) {
12287         push @full_names, $script->table($short_name)->full_name;
12288     }
12289     $fields[1] = join " ", @full_names;
12290     $_ = join "; ", @fields;
12291
12292     return;
12293 }
12294
12295 sub generate_hst {
12296
12297     # Populates the Hangul Syllable Type property from first principles
12298
12299     my $file= shift;
12300     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12301
12302     # These few ranges are hard-coded in.
12303     $file->insert_lines(split /\n/, <<'END'
12304 1100..1159    ; L
12305 115F          ; L
12306 1160..11A2    ; V
12307 11A8..11F9    ; T
12308 END
12309 );
12310
12311     # The Hangul syllables in version 1 are completely different than what came
12312     # after, so just ignore them there.
12313     if ($v_version lt v2.0.0) {
12314         my $property = property_ref($file->property);
12315         push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12316         push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12317         return;
12318     }
12319
12320     # The algorithmically derived syllables are almost all LVT ones, so
12321     # initialize the whole range with that.
12322     $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12323                         $SBase, $SBase + $SCount -1);
12324
12325     # Those ones that aren't LVT are LV, and they occur at intervals of
12326     # $TCount code points, starting with the first code point, at $SBase.
12327     for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12328         $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12329     }
12330
12331     return;
12332 }
12333
12334 sub generate_GCB {
12335
12336     # Populates the Grapheme Cluster Break property from first principles
12337
12338     my $file= shift;
12339     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12340
12341     # All these definitions are from
12342     # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12343     # from http://www.unicode.org/reports/tr29/tr29-4.html
12344
12345     foreach my $range ($gc->ranges) {
12346
12347         # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12348         # and gc=Cf
12349         if ($range->value =~ / ^ M [en] $ /x) {
12350             $file->insert_lines(sprintf "%04X..%04X; Extend",
12351                                 $range->start,  $range->end);
12352         }
12353         elsif ($range->value =~ / ^ C [cf] $ /x) {
12354             $file->insert_lines(sprintf "%04X..%04X; Control",
12355                                 $range->start,  $range->end);
12356         }
12357     }
12358     $file->insert_lines("2028; Control"); # Line Separator
12359     $file->insert_lines("2029; Control"); # Paragraph Separator
12360
12361     $file->insert_lines("000D; CR");
12362     $file->insert_lines("000A; LF");
12363
12364     # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12365     foreach my $code_point ( qw{
12366                                 40000
12367                                 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12368                                 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12369                                 }
12370     ) {
12371         my $category = $gc->value_of(hex $code_point);
12372         next if ! defined $category || $category eq 'Cn'; # But not if
12373                                                           # unassigned in this
12374                                                           # release
12375         $file->insert_lines("$code_point; Extend");
12376     }
12377
12378     my $hst = property_ref('Hangul_Syllable_Type');
12379     if ($hst->count > 0) {
12380         foreach my $range ($hst->ranges) {
12381             $file->insert_lines(sprintf "%04X..%04X; %s",
12382                                     $range->start, $range->end, $range->value);
12383         }
12384     }
12385     else {
12386         generate_hst($file);
12387     }
12388
12389     return;
12390 }
12391
12392 sub setup_early_name_alias {
12393     my $file= shift;
12394     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12395
12396     # This has the effect of pretending that the Name_Alias property was
12397     # available in all Unicode releases.  Strictly speaking, this property
12398     # should not be availabe in early releases, but doing this allows
12399     # charnames.pm to work on older releases without change.  Prior to v5.16
12400     # it had these names hard-coded inside it.  Unicode 6.1 came along and
12401     # created these names, and so they were removed from charnames.
12402
12403     my $aliases = property_ref('Name_Alias');
12404     if (! defined $aliases) {
12405         $aliases = Property->new('Name_Alias', Default_Map => "");
12406     }
12407
12408     $file->insert_lines(get_old_name_aliases());
12409
12410     return;
12411 }
12412
12413 sub get_old_name_aliases () {
12414
12415     # The Unicode_1_Name field, contains most of these names.  One would
12416     # expect, given the field's name, that its values would be fixed across
12417     # versions, giving the true Unicode version 1 name for the character.
12418     # Sadly, this is not the case.  Actually Version 1.1.5 had no names for
12419     # any of the controls; Version 2.0 introduced names for the C0 controls,
12420     # and 3.0 introduced C1 names.  3.0.1 removed the name INDEX; and 3.2
12421     # changed some names: it
12422     #   changed to parenthesized versions like "NEXT LINE" to
12423     #       "NEXT LINE (NEL)";
12424     #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12425     #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12426     #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12427     # This list contains all the names that were defined so that
12428     # charnames::vianame(), etc. understand them all EVEN if this version of
12429     # Unicode didn't specify them (this could be construed as a bug).
12430     # mktables elsewhere gives preference to the Unicode_1_Name field over
12431     # these names, so that viacode() will return the correct value for that
12432     # version of Unicode, except when that version doesn't define a name,
12433     # viacode() will return one anyway (this also could be construed as a
12434     # bug).  But these potential "bugs" allow for the smooth working of code
12435     # on earlier Unicode releases.
12436
12437     my @return = split /\n/, <<'END';
12438 0000;NULL;control
12439 0000;NUL;abbreviation
12440 0001;START OF HEADING;control
12441 0001;SOH;abbreviation
12442 0002;START OF TEXT;control
12443 0002;STX;abbreviation
12444 0003;END OF TEXT;control
12445 0003;ETX;abbreviation
12446 0004;END OF TRANSMISSION;control
12447 0004;EOT;abbreviation
12448 0005;ENQUIRY;control
12449 0005;ENQ;abbreviation
12450 0006;ACKNOWLEDGE;control
12451 0006;ACK;abbreviation
12452 0007;BELL;control
12453 0007;BEL;abbreviation
12454 0008;BACKSPACE;control
12455 0008;BS;abbreviation
12456 0009;CHARACTER TABULATION;control
12457 0009;HORIZONTAL TABULATION;control
12458 0009;HT;abbreviation
12459 0009;TAB;abbreviation
12460 000A;LINE FEED;control
12461 000A;LINE FEED (LF);control
12462 000A;NEW LINE;control
12463 000A;END OF LINE;control
12464 000A;LF;abbreviation
12465 000A;NL;abbreviation
12466 000A;EOL;abbreviation
12467 000B;LINE TABULATION;control
12468 000B;VERTICAL TABULATION;control
12469 000B;VT;abbreviation
12470 000C;FORM FEED;control
12471 000C;FORM FEED (FF);control
12472 000C;FF;abbreviation
12473 000D;CARRIAGE RETURN;control
12474 000D;CARRIAGE RETURN (CR);control
12475 000D;CR;abbreviation
12476 000E;SHIFT OUT;control
12477 000E;LOCKING-SHIFT ONE;control
12478 000E;SO;abbreviation
12479 000F;SHIFT IN;control
12480 000F;LOCKING-SHIFT ZERO;control
12481 000F;SI;abbreviation
12482 0010;DATA LINK ESCAPE;control
12483 0010;DLE;abbreviation
12484 0011;DEVICE CONTROL ONE;control
12485 0011;DC1;abbreviation
12486 0012;DEVICE CONTROL TWO;control
12487 0012;DC2;abbreviation
12488 0013;DEVICE CONTROL THREE;control
12489 0013;DC3;abbreviation
12490 0014;DEVICE CONTROL FOUR;control
12491 0014;DC4;abbreviation
12492 0015;NEGATIVE ACKNOWLEDGE;control
12493 0015;NAK;abbreviation
12494 0016;SYNCHRONOUS IDLE;control
12495 0016;SYN;abbreviation
12496 0017;END OF TRANSMISSION BLOCK;control
12497 0017;ETB;abbreviation
12498 0018;CANCEL;control
12499 0018;CAN;abbreviation
12500 0019;END OF MEDIUM;control
12501 0019;EOM;abbreviation
12502 001A;SUBSTITUTE;control
12503 001A;SUB;abbreviation
12504 001B;ESCAPE;control
12505 001B;ESC;abbreviation
12506 001C;INFORMATION SEPARATOR FOUR;control
12507 001C;FILE SEPARATOR;control
12508 001C;FS;abbreviation
12509 001D;INFORMATION SEPARATOR THREE;control
12510 001D;GROUP SEPARATOR;control
12511 001D;GS;abbreviation
12512 001E;INFORMATION SEPARATOR TWO;control
12513 001E;RECORD SEPARATOR;control
12514 001E;RS;abbreviation
12515 001F;INFORMATION SEPARATOR ONE;control
12516 001F;UNIT SEPARATOR;control
12517 001F;US;abbreviation
12518 0020;SP;abbreviation
12519 007F;DELETE;control
12520 007F;DEL;abbreviation
12521 0080;PADDING CHARACTER;figment
12522 0080;PAD;abbreviation
12523 0081;HIGH OCTET PRESET;figment
12524 0081;HOP;abbreviation
12525 0082;BREAK PERMITTED HERE;control
12526 0082;BPH;abbreviation
12527 0083;NO BREAK HERE;control
12528 0083;NBH;abbreviation
12529 0084;INDEX;control
12530 0084;IND;abbreviation
12531 0085;NEXT LINE;control
12532 0085;NEXT LINE (NEL);control
12533 0085;NEL;abbreviation
12534 0086;START OF SELECTED AREA;control
12535 0086;SSA;abbreviation
12536 0087;END OF SELECTED AREA;control
12537 0087;ESA;abbreviation
12538 0088;CHARACTER TABULATION SET;control
12539 0088;HORIZONTAL TABULATION SET;control
12540 0088;HTS;abbreviation
12541 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12542 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12543 0089;HTJ;abbreviation
12544 008A;LINE TABULATION SET;control
12545 008A;VERTICAL TABULATION SET;control
12546 008A;VTS;abbreviation
12547 008B;PARTIAL LINE FORWARD;control
12548 008B;PARTIAL LINE DOWN;control
12549 008B;PLD;abbreviation
12550 008C;PARTIAL LINE BACKWARD;control
12551 008C;PARTIAL LINE UP;control
12552 008C;PLU;abbreviation
12553 008D;REVERSE LINE FEED;control
12554 008D;REVERSE INDEX;control
12555 008D;RI;abbreviation
12556 008E;SINGLE SHIFT TWO;control
12557 008E;SINGLE-SHIFT-2;control
12558 008E;SS2;abbreviation
12559 008F;SINGLE SHIFT THREE;control
12560 008F;SINGLE-SHIFT-3;control
12561 008F;SS3;abbreviation
12562 0090;DEVICE CONTROL STRING;control
12563 0090;DCS;abbreviation
12564 0091;PRIVATE USE ONE;control
12565 0091;PRIVATE USE-1;control
12566 0091;PU1;abbreviation
12567 0092;PRIVATE USE TWO;control
12568 0092;PRIVATE USE-2;control
12569 0092;PU2;abbreviation
12570 0093;SET TRANSMIT STATE;control
12571 0093;STS;abbreviation
12572 0094;CANCEL CHARACTER;control
12573 0094;CCH;abbreviation
12574 0095;MESSAGE WAITING;control
12575 0095;MW;abbreviation
12576 0096;START OF GUARDED AREA;control
12577 0096;START OF PROTECTED AREA;control
12578 0096;SPA;abbreviation
12579 0097;END OF GUARDED AREA;control
12580 0097;END OF PROTECTED AREA;control
12581 0097;EPA;abbreviation
12582 0098;START OF STRING;control
12583 0098;SOS;abbreviation
12584 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12585 0099;SGC;abbreviation
12586 009A;SINGLE CHARACTER INTRODUCER;control
12587 009A;SCI;abbreviation
12588 009B;CONTROL SEQUENCE INTRODUCER;control
12589 009B;CSI;abbreviation
12590 009C;STRING TERMINATOR;control
12591 009C;ST;abbreviation
12592 009D;OPERATING SYSTEM COMMAND;control
12593 009D;OSC;abbreviation
12594 009E;PRIVACY MESSAGE;control
12595 009E;PM;abbreviation
12596 009F;APPLICATION PROGRAM COMMAND;control
12597 009F;APC;abbreviation
12598 00A0;NBSP;abbreviation
12599 00AD;SHY;abbreviation
12600 200B;ZWSP;abbreviation
12601 200C;ZWNJ;abbreviation
12602 200D;ZWJ;abbreviation
12603 200E;LRM;abbreviation
12604 200F;RLM;abbreviation
12605 202A;LRE;abbreviation
12606 202B;RLE;abbreviation
12607 202C;PDF;abbreviation
12608 202D;LRO;abbreviation
12609 202E;RLO;abbreviation
12610 FEFF;BYTE ORDER MARK;alternate
12611 FEFF;BOM;abbreviation
12612 FEFF;ZWNBSP;abbreviation
12613 END
12614
12615     if ($v_version ge v3.0.0) {
12616         push @return, split /\n/, <<'END';
12617 180B; FVS1; abbreviation
12618 180C; FVS2; abbreviation
12619 180D; FVS3; abbreviation
12620 180E; MVS; abbreviation
12621 202F; NNBSP; abbreviation
12622 END
12623     }
12624
12625     if ($v_version ge v3.2.0) {
12626         push @return, split /\n/, <<'END';
12627 034F; CGJ; abbreviation
12628 205F; MMSP; abbreviation
12629 2060; WJ; abbreviation
12630 END
12631         # Add in VS1..VS16
12632         my $cp = 0xFE00 - 1;
12633         for my $i (1..16) {
12634             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12635         }
12636     }
12637     if ($v_version ge v4.0.0) { # Add in VS17..VS256
12638         my $cp = 0xE0100 - 17;
12639         for my $i (17..256) {
12640             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12641         }
12642     }
12643
12644     # ALERT did not come along until 6.0, at which point it became preferred
12645     # over BELL, and was never in the Unicode_1_Name field.  For the same
12646     # reasons, that the other names are made known to all releases by this
12647     # function, we make ALERT known too.  By inserting it
12648     # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12649     my $alert = '0007; ALERT; control';
12650     if ($v_version lt v6.0.0) {
12651         push @return, $alert;
12652     }
12653     else {
12654         unshift @return, $alert;
12655     }
12656
12657     return @return;
12658 }
12659
12660 sub filter_later_version_name_alias_line {
12661
12662     # This file has an extra entry per line for the alias type.  This is
12663     # handled by creating a compound entry: "$alias: $type";  First, split
12664     # the line into components.
12665     my ($range, $alias, $type, @remainder)
12666         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12667
12668     # This file contains multiple entries for some components, so tell the
12669     # downstream code to allow this in our internal tables; the
12670     # $MULTIPLE_AFTER preserves the input ordering.
12671     $_ = join ";", $range, $CMD_DELIM
12672                            . $REPLACE_CMD
12673                            . '='
12674                            . $MULTIPLE_AFTER
12675                            . $CMD_DELIM
12676                            . "$alias: $type",
12677                    @remainder;
12678     return;
12679 }
12680
12681 sub filter_early_version_name_alias_line {
12682
12683     # Early versions did not have the trailing alias type field; implicitly it
12684     # was 'correction'.   But our synthetic lines we add in this program do
12685     # have it, so test for the type field.
12686     $_ .= "; correction" if $_ !~ /;.*;/;
12687
12688     filter_later_version_name_alias_line;
12689     return;
12690 }
12691
12692 sub finish_Unicode() {
12693     # This routine should be called after all the Unicode files have been read
12694     # in.  It:
12695     # 1) Creates properties that are missing from the version of Unicode being
12696     #    compiled, and which, for whatever reason, are needed for the Perl
12697     #    core to function properly.  These are minimally populated as
12698     #    necessary.
12699     # 2) Adds the mappings for code points missing from the files which have
12700     #    defaults specified for them.
12701     # 3) At this this point all mappings are known, so it computes the type of
12702     #    each property whose type hasn't been determined yet.
12703     # 4) Calculates all the regular expression match tables based on the
12704     #    mappings.
12705     # 5) Calculates and adds the tables which are defined by Unicode, but
12706     #    which aren't derived by them, and certain derived tables that Perl
12707     #    uses.
12708
12709     # Folding information was introduced later into Unicode data.  To get
12710     # Perl's case ignore (/i) to work at all in releases that don't have
12711     # folding, use the best available alternative, which is lower casing.
12712     my $fold = property_ref('Case_Folding');
12713     if ($fold->is_empty) {
12714         $fold->initialize(property_ref('Lowercase_Mapping'));
12715         $fold->add_note(join_lines(<<END
12716 WARNING: This table uses lower case as a substitute for missing fold
12717 information
12718 END
12719         ));
12720     }
12721
12722     # Multiple-character mapping was introduced later into Unicode data, so it
12723     # is by default the simple version.  If to output the simple versions and
12724     # not present, just use the regular (which in these Unicode versions is
12725     # the simple as well).
12726     foreach my $map (qw {   Uppercase_Mapping
12727                             Lowercase_Mapping
12728                             Titlecase_Mapping
12729                             Case_Folding
12730                         } )
12731     {
12732         my $comment = <<END;
12733
12734 Note that although the Perl core uses this file, it has the standard values
12735 for code points from U+0000 to U+00FF compiled in, so changing this table will
12736 not change the core's behavior with respect to these code points.  Use
12737 Unicode::Casing to override this table.
12738 END
12739         if ($map eq 'Case_Folding') {
12740             $comment .= <<END;
12741 (/i regex matching is not overridable except by using a custom regex engine)
12742 END
12743         }
12744         property_ref($map)->add_comment(join_lines($comment));
12745         my $simple = property_ref("Simple_$map");
12746         next if ! $simple->is_empty;
12747         if ($simple->to_output_map) {
12748             $simple->initialize(property_ref($map));
12749         }
12750         else {
12751             property_ref($map)->set_proxy_for($simple->name);
12752         }
12753     }
12754
12755     # For each property, fill in any missing mappings, and calculate the re
12756     # match tables.  If a property has more than one missing mapping, the
12757     # default is a reference to a data structure, and requires data from other
12758     # properties to resolve.  The sort is used to cause these to be processed
12759     # last, after all the other properties have been calculated.
12760     # (Fortunately, the missing properties so far don't depend on each other.)
12761     foreach my $property
12762         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12763         property_ref('*'))
12764     {
12765         # $perl has been defined, but isn't one of the Unicode properties that
12766         # need to be finished up.
12767         next if $property == $perl;
12768
12769         # Nor do we need to do anything with properties that aren't going to
12770         # be output.
12771         next if $property->fate == $SUPPRESSED;
12772
12773         # Handle the properties that have more than one possible default
12774         if (ref $property->default_map) {
12775             my $default_map = $property->default_map;
12776
12777             # These properties have stored in the default_map:
12778             # One or more of:
12779             #   1)  A default map which applies to all code points in a
12780             #       certain class
12781             #   2)  an expression which will evaluate to the list of code
12782             #       points in that class
12783             # And
12784             #   3) the default map which applies to every other missing code
12785             #      point.
12786             #
12787             # Go through each list.
12788             while (my ($default, $eval) = $default_map->get_next_defaults) {
12789
12790                 # Get the class list, and intersect it with all the so-far
12791                 # unspecified code points yielding all the code points
12792                 # in the class that haven't been specified.
12793                 my $list = eval $eval;
12794                 if ($@) {
12795                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12796                     last;
12797                 }
12798
12799                 # Narrow down the list to just those code points we don't have
12800                 # maps for yet.
12801                 $list = $list & $property->inverse_list;
12802
12803                 # Add mappings to the property for each code point in the list
12804                 foreach my $range ($list->ranges) {
12805                     $property->add_map($range->start, $range->end, $default,
12806                     Replace => $CROAK);
12807                 }
12808             }
12809
12810             # All remaining code points have the other mapping.  Set that up
12811             # so the normal single-default mapping code will work on them
12812             $property->set_default_map($default_map->other_default);
12813
12814             # And fall through to do that
12815         }
12816
12817         # We should have enough data now to compute the type of the property.
12818         my $property_name = $property->name;
12819         $property->compute_type;
12820         my $property_type = $property->type;
12821
12822         next if ! $property->to_create_match_tables;
12823
12824         # Here want to create match tables for this property
12825
12826         # The Unicode db always (so far, and they claim into the future) have
12827         # the default for missing entries in binary properties be 'N' (unless
12828         # there is a '@missing' line that specifies otherwise)
12829         if (! defined $property->default_map) {
12830             if ($property_type == $BINARY) {
12831                 $property->set_default_map('N');
12832             }
12833             elsif ($property_type == $ENUM) {
12834                 Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
12835                 $property->set_default_map('XXX This makes sure there is a default map');
12836             }
12837         }
12838
12839         # Add any remaining code points to the mapping, using the default for
12840         # missing code points.
12841         my $default_table;
12842         if (defined (my $default_map = $property->default_map)) {
12843
12844             # Make sure there is a match table for the default
12845             if (! defined ($default_table = $property->table($default_map))) {
12846                 $default_table = $property->add_match_table($default_map);
12847             }
12848
12849             # And, if the property is binary, the default table will just
12850             # be the complement of the other table.
12851             if ($property_type == $BINARY) {
12852                 my $non_default_table;
12853
12854                 # Find the non-default table.
12855                 for my $table ($property->tables) {
12856                     next if $table == $default_table;
12857                     $non_default_table = $table;
12858                 }
12859                 $default_table->set_complement($non_default_table);
12860             }
12861             else {
12862
12863                 # This fills in any missing values with the default.  It's not
12864                 # necessary to do this with binary properties, as the default
12865                 # is defined completely in terms of the Y table.
12866                 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
12867                                    $default_map, Replace => $NO);
12868             }
12869         }
12870
12871         # Have all we need to populate the match tables.
12872         my $maps_should_be_defined = $property->pre_declared_maps;
12873         foreach my $range ($property->ranges) {
12874             my $map = $range->value;
12875             my $table = $property->table($map);
12876             if (! defined $table) {
12877
12878                 # Integral and rational property values are not necessarily
12879                 # defined in PropValueAliases, but whether all the other ones
12880                 # should be depends on the property.
12881                 if ($maps_should_be_defined
12882                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12883                 {
12884                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
12885                 }
12886                 $table = $property->add_match_table($map);
12887             }
12888
12889             next if $table->complement != 0;    # Don't need to populate these
12890             $table->add_range($range->start, $range->end);
12891         }
12892
12893         # A forced binary property has additional true/false tables which
12894         # should have been set up when it was forced into binary.  The false
12895         # table matches exactly the same set as the property's default table.
12896         # The true table matches the complement of that.  The false table is
12897         # not the same as an additional set of aliases on top of the default
12898         # table, so use 'set_equivalent_to'.  If it were implemented as
12899         # additional aliases, various things would have to be adjusted, but
12900         # especially, if the user wants to get a list of names for the table
12901         # using Unicode::UCD::prop_value_aliases(), s/he should get a
12902         # different set depending on whether they want the default table or
12903         # the false table.
12904         if ($property_type == $FORCED_BINARY) {
12905             $property->table('N')->set_equivalent_to($default_table,
12906                                                      Related => 1);
12907             $property->table('Y')->set_complement($default_table);
12908         }
12909
12910         # For Perl 5.6 compatibility, all properties matchable in regexes can
12911         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
12912         # But warn if this creates a conflict with a (new) Unicode property
12913         # name, although it appears that Unicode has made a decision never to
12914         # begin a property name with 'Is_', so this shouldn't happen.
12915         foreach my $alias ($property->aliases) {
12916             my $Is_name = 'Is_' . $alias->name;
12917             if (defined (my $pre_existing = property_ref($Is_name))) {
12918                 Carp::my_carp(<<END
12919 There is already an alias named $Is_name (from " . $pre_existing . "), so
12920 creating one for $property won't work.  This is bad news.  If it is not too
12921 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
12922 from the git blame log for this area of the code that suppressed individual
12923 aliases that conflict with the new Unicode names.  Proceeding anyway.
12924 END
12925                 );
12926             }
12927         } # End of loop through aliases for this property
12928     } # End of loop through all Unicode properties.
12929
12930     # Fill in the mappings that Unicode doesn't completely furnish.  First the
12931     # single letter major general categories.  If Unicode were to start
12932     # delivering the values, this would be redundant, but better that than to
12933     # try to figure out if should skip and not get it right.  Ths could happen
12934     # if a new major category were to be introduced, and the hard-coded test
12935     # wouldn't know about it.
12936     # This routine depends on the standard names for the general categories
12937     # being what it thinks they are, like 'Cn'.  The major categories are the
12938     # union of all the general category tables which have the same first
12939     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12940     foreach my $minor_table ($gc->tables) {
12941         my $minor_name = $minor_table->name;
12942         next if length $minor_name == 1;
12943         if (length $minor_name != 2) {
12944             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
12945             next;
12946         }
12947
12948         my $major_name = uc(substr($minor_name, 0, 1));
12949         my $major_table = $gc->table($major_name);
12950         $major_table += $minor_table;
12951     }
12952
12953     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
12954     # defines it as LC)
12955     my $LC = $gc->table('LC');
12956     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
12957     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
12958
12959
12960     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12961                          # deliver the correct values in it
12962         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12963
12964         # Lt not in release 1.
12965         if (defined $gc->table('Lt')) {
12966             $LC += $gc->table('Lt');
12967             $gc->table('Lt')->set_caseless_equivalent($LC);
12968         }
12969     }
12970     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12971
12972     $gc->table('Ll')->set_caseless_equivalent($LC);
12973     $gc->table('Lu')->set_caseless_equivalent($LC);
12974
12975     my $Cs = $gc->table('Cs');
12976
12977     # Create digit and case fold tables with the original file names for
12978     # backwards compatibility with applications that read them directly.
12979     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12980                               Default_Map => "",
12981                               File => 'Digit',    # Trad. location
12982                               Directory => $map_directory,
12983                               Type => $STRING,
12984                               Replacement_Property => "Perl_Decimal_Digit",
12985                               Initialize => property_ref('Perl_Decimal_Digit'),
12986                             );
12987     $Digit->add_comment(join_lines(<<END
12988 This file gives the mapping of all code points which represent a single
12989 decimal digit [0-9] to their respective digits.  For example, the code point
12990 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
12991 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12992 numerals.
12993 END
12994     ));
12995
12996     Property->new('Legacy_Case_Folding',
12997                     File => "Fold",
12998                     Directory => $map_directory,
12999                     Default_Map => $CODE_POINT,
13000                     Type => $STRING,
13001                     Replacement_Property => "Case_Folding",
13002                     Format => $HEX_FORMAT,
13003                     Initialize => property_ref('cf'),
13004     );
13005
13006     # The Script_Extensions property started out as a clone of the Script
13007     # property.  But processing its data file caused some elements to be
13008     # replaced with different data.  (These elements were for the Common and
13009     # Inherited properties.)  This data is a qw() list of all the scripts that
13010     # the code points in the given range are in.  An example line is:
13011     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
13012     #
13013     # The code above has created a new match table named "Arab Syrc Thaa"
13014     # which contains 060C.  (The cloned table started out with this code point
13015     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
13016     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
13017     # match table.  This is repeated for all these tables and ranges.  The map
13018     # data is retained in the map table for reference, but the spurious match
13019     # tables are deleted.
13020
13021     my $scx = property_ref("Script_Extensions");
13022     if (defined $scx) {
13023         foreach my $table ($scx->tables) {
13024             next unless $table->name =~ /\s/;   # All the new and only the new
13025                                                 # tables have a space in their
13026                                                 # names
13027             my @scripts = split /\s+/, $table->name;
13028             foreach my $script (@scripts) {
13029                 my $script_table = $scx->table($script);
13030                 $script_table += $table;
13031             }
13032             $scx->delete_match_table($table);
13033         }
13034     }
13035
13036     return;
13037 }
13038
13039 sub pre_3_dot_1_Nl () {
13040
13041     # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13042     # is when Unicode's became fully usable.  These code points were
13043     # determined by inspection and experimentation.  gc=nl is important for
13044     # certain Perl-extension properties that should be available in all
13045     # releases.
13046
13047     my $Nl = Range_List->new();
13048     if (defined (my $official = $gc->table('Nl'))) {
13049         $Nl += $official;
13050     }
13051     else {
13052         $Nl->add_range(0x2160, 0x2182);
13053         $Nl->add_range(0x3007, 0x3007);
13054         $Nl->add_range(0x3021, 0x3029);
13055     }
13056     $Nl->add_range(0xFE20, 0xFE23);
13057     $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13058                                                             # these were added
13059     return $Nl;
13060 }
13061
13062 sub compile_perl() {
13063     # Create perl-defined tables.  Almost all are part of the pseudo-property
13064     # named 'perl' internally to this program.  Many of these are recommended
13065     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
13066     # on those found there.
13067     # Almost all of these are equivalent to some Unicode property.
13068     # A number of these properties have equivalents restricted to the ASCII
13069     # range, with their names prefaced by 'Posix', to signify that these match
13070     # what the Posix standard says they should match.  A couple are
13071     # effectively this, but the name doesn't have 'Posix' in it because there
13072     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
13073     # to the full Unicode range, by our guesses as to what is appropriate.
13074
13075     # 'Any' is all code points.  As an error check, instead of just setting it
13076     # to be that, construct it to be the union of all the major categories
13077     $Any = $perl->add_match_table('Any',
13078             Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
13079             Matches_All => 1);
13080
13081     foreach my $major_table ($gc->tables) {
13082
13083         # Major categories are the ones with single letter names.
13084         next if length($major_table->name) != 1;
13085
13086         $Any += $major_table;
13087     }
13088
13089     if ($Any->max != $MAX_UNICODE_CODEPOINT) {
13090         Carp::my_carp_bug("Generated highest code point ("
13091            . sprintf("%X", $Any->max)
13092            . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
13093     }
13094     if ($Any->range_count != 1 || $Any->min != 0) {
13095      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
13096     }
13097
13098     $Any->add_alias('All');
13099
13100     # Assigned is the opposite of gc=unassigned
13101     my $Assigned = $perl->add_match_table('Assigned',
13102                                 Description  => "All assigned code points",
13103                                 Initialize => ~ $gc->table('Unassigned'),
13104                                 );
13105
13106     # Our internal-only property should be treated as more than just a
13107     # synonym; grandfather it in to the pod.
13108     $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
13109                             Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
13110             ->set_equivalent_to(property_ref('ccc')->table('Above'),
13111                                                                 Related => 1);
13112
13113     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
13114     if (defined $block) {   # This is equivalent to the block if have it.
13115         my $Unicode_ASCII = $block->table('Basic_Latin');
13116         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
13117             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
13118         }
13119     }
13120
13121     # Very early releases didn't have blocks, so initialize ASCII ourselves if
13122     # necessary
13123     if ($ASCII->is_empty) {
13124         if (! NON_ASCII_PLATFORM) {
13125             $ASCII->add_range(0, 127);
13126         }
13127         else {
13128             for my $i (0 .. 127) {
13129                 $ASCII->add_range(utf8::unicode_to_native($i),
13130                                   utf8::unicode_to_native($i));
13131             }
13132         }
13133     }
13134
13135     # Get the best available case definitions.  Early Unicode versions didn't
13136     # have Uppercase and Lowercase defined, so use the general category
13137     # instead for them, modified by hard-coding in the code points each is
13138     # missing.
13139     my $Lower = $perl->add_match_table('Lower');
13140     my $Unicode_Lower = property_ref('Lowercase');
13141     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
13142         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
13143
13144     }
13145     else {
13146         $Lower += $gc->table('Lowercase_Letter');
13147
13148         # There are quite a few code points in Lower, that aren't in gc=lc,
13149         # and not all are in all releases.
13150         foreach my $code_point (    utf8::unicode_to_native(0xAA),
13151                                     utf8::unicode_to_native(0xBA),
13152                                     0x02B0 .. 0x02B8,
13153                                     0x02C0 .. 0x02C1,
13154                                     0x02E0 .. 0x02E4,
13155                                     0x0345,
13156                                     0x037A,
13157                                     0x1D2C .. 0x1D6A,
13158                                     0x1D78,
13159                                     0x1D9B .. 0x1DBF,
13160                                     0x2071,
13161                                     0x207F,
13162                                     0x2090 .. 0x209C,
13163                                     0x2170 .. 0x217F,
13164                                     0x24D0 .. 0x24E9,
13165                                     0x2C7C .. 0x2C7D,
13166                                     0xA770,
13167                                     0xA7F8 .. 0xA7F9,
13168         ) {
13169             # Don't include the code point unless it is assigned in this
13170             # release
13171             my $category = $gc->value_of(hex $code_point);
13172             next if ! defined $category || $category eq 'Cn';
13173
13174             $Lower += $code_point;
13175         }
13176     }
13177     $Lower->add_alias('XPosixLower');
13178     my $Posix_Lower = $perl->add_match_table("PosixLower",
13179                             Description => "[a-z]",
13180                             Initialize => $Lower & $ASCII,
13181                             );
13182
13183     my $Upper = $perl->add_match_table('Upper');
13184     my $Unicode_Upper = property_ref('Uppercase');
13185     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
13186         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
13187     }
13188     else {
13189
13190         # Unlike Lower, there are only two ranges in Upper that aren't in
13191         # gc=Lu, and all code points were assigned in all releases.
13192         $Upper += $gc->table('Uppercase_Letter');
13193         $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
13194         $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
13195     }
13196     $Upper->add_alias('XPosixUpper');
13197     my $Posix_Upper = $perl->add_match_table("PosixUpper",
13198                             Description => "[A-Z]",
13199                             Initialize => $Upper & $ASCII,
13200                             );
13201
13202     # Earliest releases didn't have title case.  Initialize it to empty if not
13203     # otherwise present
13204     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
13205                                        Description => '(= \p{Gc=Lt})');
13206     my $lt = $gc->table('Lt');
13207
13208     # Earlier versions of mktables had this related to $lt since they have
13209     # identical code points, but their caseless equivalents are not the same,
13210     # one being 'Cased' and the other being 'LC', and so now must be kept as
13211     # separate entities.
13212     if (defined $lt) {
13213         $Title += $lt;
13214     }
13215     else {
13216         push @tables_that_may_be_empty, $Title->complete_name;
13217     }
13218
13219     my $Unicode_Cased = property_ref('Cased');
13220     if (defined $Unicode_Cased) {
13221         my $yes = $Unicode_Cased->table('Y');
13222         my $no = $Unicode_Cased->table('N');
13223         $Title->set_caseless_equivalent($yes);
13224         if (defined $Unicode_Upper) {
13225             $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
13226             $Unicode_Upper->table('N')->set_caseless_equivalent($no);
13227         }
13228         $Upper->set_caseless_equivalent($yes);
13229         if (defined $Unicode_Lower) {
13230             $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
13231             $Unicode_Lower->table('N')->set_caseless_equivalent($no);
13232         }
13233         $Lower->set_caseless_equivalent($yes);
13234     }
13235     else {
13236         # If this Unicode version doesn't have Cased, set up the Perl
13237         # extension from first principles.  From Unicode 5.1: Definition D120:
13238         # A character C is defined to be cased if and only if C has the
13239         # Lowercase or Uppercase property or has a General_Category value of
13240         # Titlecase_Letter.
13241         my $cased = $perl->add_match_table('Cased',
13242                         Initialize => $Lower + $Upper + $Title,
13243                         Description => 'Uppercase or Lowercase or Titlecase',
13244                         );
13245         # $notcased is purely for the caseless equivalents below
13246         my $notcased = $perl->add_match_table('_Not_Cased',
13247                                 Initialize => ~ $cased,
13248                                 Fate => $INTERNAL_ONLY,
13249                                 Description => 'All not-cased code points');
13250         $Title->set_caseless_equivalent($cased);
13251         if (defined $Unicode_Upper) {
13252             $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
13253             $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
13254         }
13255         $Upper->set_caseless_equivalent($cased);
13256         if (defined $Unicode_Lower) {
13257             $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
13258             $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
13259         }
13260         $Lower->set_caseless_equivalent($cased);
13261     }
13262
13263     # Similarly, set up our own Case_Ignorable property if this Unicode
13264     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
13265     # C is defined to be case-ignorable if C has the value MidLetter or the
13266     # value MidNumLet for the Word_Break property or its General_Category is
13267     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13268     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13269
13270     # Perl has long had an internal-only alias for this property; grandfather
13271     # it in to the pod, but discourage its use.
13272     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13273                                                      Re_Pod_Entry => 1,
13274                                                      Fate => $INTERNAL_ONLY,
13275                                                      Status => $DISCOURAGED);
13276     my $case_ignorable = property_ref('Case_Ignorable');
13277     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13278         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13279                                                                 Related => 1);
13280     }
13281     else {
13282
13283         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13284
13285         # The following three properties are not in early releases
13286         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13287         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13288         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13289
13290         # For versions 4.1 - 5.0, there is no MidNumLet property, and
13291         # correspondingly the case-ignorable definition lacks that one.  For
13292         # 4.0, it appears that it was meant to be the same definition, but was
13293         # inadvertently omitted from the standard's text, so add it if the
13294         # property actually is there
13295         my $wb = property_ref('Word_Break');
13296         if (defined $wb) {
13297             my $midlet = $wb->table('MidLetter');
13298             $perl_case_ignorable += $midlet if defined $midlet;
13299             my $midnumlet = $wb->table('MidNumLet');
13300             $perl_case_ignorable += $midnumlet if defined $midnumlet;
13301         }
13302         else {
13303
13304             # In earlier versions of the standard, instead of the above two
13305             # properties , just the following characters were used:
13306             $perl_case_ignorable +=
13307                             ord("'")
13308                         +   utf8::unicode_to_native(0xAD)  # SOFT HYPHEN (SHY)
13309                         +   0x2019; # RIGHT SINGLE QUOTATION MARK
13310         }
13311     }
13312
13313     # The remaining perl defined tables are mostly based on Unicode TR 18,
13314     # "Annex C: Compatibility Properties".  All of these have two versions,
13315     # one whose name generally begins with Posix that is posix-compliant, and
13316     # one that matches Unicode characters beyond the Posix, ASCII range
13317
13318     my $Alpha = $perl->add_match_table('Alpha');
13319
13320     # Alphabetic was not present in early releases
13321     my $Alphabetic = property_ref('Alphabetic');
13322     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13323         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13324     }
13325     else {
13326
13327         # The Alphabetic property doesn't exist for early releases, so
13328         # generate it.  The actual definition, in 5.2 terms is:
13329         #
13330         # gc=L + gc=Nl + Other_Alphabetic
13331         #
13332         # Other_Alphabetic is also not defined in these early releases, but it
13333         # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13334         # those last two as well, then subtract the relatively few of them that
13335         # shouldn't have been added.  (The gc=So range is the circled capital
13336         # Latin characters.  Early releases mistakenly didn't also include the
13337         # lower-case versions of these characters, and so we don't either, to
13338         # maintain consistency with those releases that first had this
13339         # property.
13340         $Alpha->initialize($gc->table('Letter')
13341                            + pre_3_dot_1_Nl()
13342                            + $gc->table('Mn')
13343                            + $gc->table('Mc')
13344                         );
13345         $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
13346         foreach my $range (     [ 0x0300, 0x0344 ],
13347                                 [ 0x0346, 0x034E ],
13348                                 [ 0x0360, 0x0362 ],
13349                                 [ 0x0483, 0x0486 ],
13350                                 [ 0x0591, 0x05AF ],
13351                                 [ 0x06DF, 0x06E0 ],
13352                                 [ 0x06EA, 0x06EC ],
13353                                 [ 0x0740, 0x074A ],
13354                                 0x093C,
13355                                 0x094D,
13356                                 [ 0x0951, 0x0954 ],
13357                                 0x09BC,
13358                                 0x09CD,
13359                                 0x0A3C,
13360                                 0x0A4D,
13361                                 0x0ABC,
13362                                 0x0ACD,
13363                                 0x0B3C,
13364                                 0x0B4D,
13365                                 0x0BCD,
13366                                 0x0C4D,
13367                                 0x0CCD,
13368                                 0x0D4D,
13369                                 0x0DCA,
13370                                 [ 0x0E47, 0x0E4C ],
13371                                 0x0E4E,
13372                                 [ 0x0EC8, 0x0ECC ],
13373                                 [ 0x0F18, 0x0F19 ],
13374                                 0x0F35,
13375                                 0x0F37,
13376                                 0x0F39,
13377                                 [ 0x0F3E, 0x0F3F ],
13378                                 [ 0x0F82, 0x0F84 ],
13379                                 [ 0x0F86, 0x0F87 ],
13380                                 0x0FC6,
13381                                 0x1037,
13382                                 0x1039,
13383                                 [ 0x17C9, 0x17D3 ],
13384                                 [ 0x20D0, 0x20DC ],
13385                                 0x20E1,
13386                                 [ 0x302A, 0x302F ],
13387                                 [ 0x3099, 0x309A ],
13388                                 [ 0xFE20, 0xFE23 ],
13389                                 [ 0x1D165, 0x1D169 ],
13390                                 [ 0x1D16D, 0x1D172 ],
13391                                 [ 0x1D17B, 0x1D182 ],
13392                                 [ 0x1D185, 0x1D18B ],
13393                                 [ 0x1D1AA, 0x1D1AD ],
13394         ) {
13395             if (ref $range) {
13396                 $Alpha->delete_range($range->[0], $range->[1]);
13397             }
13398             else {
13399                 $Alpha->delete_range($range, $range);
13400             }
13401         }
13402         $Alpha->add_description('Alphabetic');
13403         $Alpha->add_alias('Alphabetic');
13404     }
13405     $Alpha->add_alias('XPosixAlpha');
13406     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13407                             Description => "[A-Za-z]",
13408                             Initialize => $Alpha & $ASCII,
13409                             );
13410     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13411     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13412
13413     my $Alnum = $perl->add_match_table('Alnum',
13414                         Description => 'Alphabetic and (decimal) Numeric',
13415                         Initialize => $Alpha + $gc->table('Decimal_Number'),
13416                         );
13417     $Alnum->add_alias('XPosixAlnum');
13418     $perl->add_match_table("PosixAlnum",
13419                             Description => "[A-Za-z0-9]",
13420                             Initialize => $Alnum & $ASCII,
13421                             );
13422
13423     my $Word = $perl->add_match_table('Word',
13424                                 Description => '\w, including beyond ASCII;'
13425                                             . ' = \p{Alnum} + \pM + \p{Pc}',
13426                                 Initialize => $Alnum + $gc->table('Mark'),
13427                                 );
13428     $Word->add_alias('XPosixWord');
13429     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13430     if (defined $Pc) {
13431         $Word += $Pc;
13432     }
13433     else {
13434         $Word += ord('_');  # Make sure this is a $Word
13435     }
13436     my $JC = property_ref('Join_Control');  # Wasn't in release 1
13437     if (defined $JC) {
13438         $Word += $JC->table('Y');
13439     }
13440     else {
13441         $Word += 0x200C + 0x200D;
13442     }
13443
13444     # This is a Perl extension, so the name doesn't begin with Posix.
13445     my $PerlWord = $perl->add_match_table('PerlWord',
13446                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13447                     Initialize => $Word & $ASCII,
13448                     );
13449     $PerlWord->add_alias('PosixWord');
13450
13451     my $Blank = $perl->add_match_table('Blank',
13452                                 Description => '\h, Horizontal white space',
13453
13454                                 # 200B is Zero Width Space which is for line
13455                                 # break control, and was listed as
13456                                 # Space_Separator in early releases
13457                                 Initialize => $gc->table('Space_Separator')
13458                                             +   ord("\t")
13459                                             -   0x200B, # ZWSP
13460                                 );
13461     $Blank->add_alias('HorizSpace');        # Another name for it.
13462     $Blank->add_alias('XPosixBlank');
13463     $perl->add_match_table("PosixBlank",
13464                             Description => "\\t and ' '",
13465                             Initialize => $Blank & $ASCII,
13466                             );
13467
13468     my $VertSpace = $perl->add_match_table('VertSpace',
13469                             Description => '\v',
13470                             Initialize =>
13471                                $gc->table('Line_Separator')
13472                              + $gc->table('Paragraph_Separator')
13473                              + utf8::unicode_to_native(0x0A)  # LINE FEED
13474                              + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
13475                              + ord("\f")
13476                              + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
13477                              + utf8::unicode_to_native(0x85)  # NEL
13478                     );
13479     # No Posix equivalent for vertical space
13480
13481     my $Space = $perl->add_match_table('Space',
13482                 Description => '\s including beyond ASCII and vertical tab',
13483                 Initialize => $Blank + $VertSpace,
13484     );
13485     $Space->add_alias('XPosixSpace');
13486     my $posix_space = $perl->add_match_table("PosixSpace",
13487                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
13488                             Initialize => $Space & $ASCII,
13489                             );
13490
13491     # Perl's traditional space doesn't include Vertical Tab prior to v5.18
13492     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13493                                   Description => '\s, including beyond ASCII',
13494                                   Initialize => $Space,
13495                                   #Initialize => $Space
13496                                   # - utf8::unicode_to_native(0x0B]
13497                                 );
13498     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
13499     my $PerlSpace = $perl->add_match_table('PerlSpace',
13500                         Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13501                         Initialize => $XPerlSpace & $ASCII,
13502                             );
13503
13504
13505     my $Cntrl = $perl->add_match_table('Cntrl',
13506                                         Description => 'Control characters');
13507     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13508     $Cntrl->add_alias('XPosixCntrl');
13509     $perl->add_match_table("PosixCntrl",
13510                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13511                             Initialize => $Cntrl & $ASCII,
13512                             );
13513
13514     # $controls is a temporary used to construct Graph.
13515     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13516                                                 + $gc->table('Control'));
13517     # Cs not in release 1
13518     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13519
13520     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
13521     my $Graph = $perl->add_match_table('Graph',
13522                         Description => 'Characters that are graphical',
13523                         Initialize => ~ ($Space + $controls),
13524                         );
13525     $Graph->add_alias('XPosixGraph');
13526     $perl->add_match_table("PosixGraph",
13527                             Description =>
13528                                 '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
13529                             Initialize => $Graph & $ASCII,
13530                             );
13531
13532     $print = $perl->add_match_table('Print',
13533                         Description => 'Characters that are graphical plus space characters (but no controls)',
13534                         Initialize => $Blank + $Graph - $gc->table('Control'),
13535                         );
13536     $print->add_alias('XPosixPrint');
13537     $perl->add_match_table("PosixPrint",
13538                             Description =>
13539                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13540                             Initialize => $print & $ASCII,
13541                             );
13542
13543     my $Punct = $perl->add_match_table('Punct');
13544     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13545
13546     # \p{punct} doesn't include the symbols, which posix does
13547     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13548                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
13549                     Initialize => $gc->table('Punctuation')
13550                                 + ($ASCII & $gc->table('Symbol')),
13551                                 Perl_Extension => 1
13552         );
13553     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13554         Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13555         Initialize => $ASCII & $XPosixPunct,
13556         );
13557
13558     my $Digit = $perl->add_match_table('Digit',
13559                             Description => '[0-9] + all other decimal digits');
13560     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13561     $Digit->add_alias('XPosixDigit');
13562     my $PosixDigit = $perl->add_match_table("PosixDigit",
13563                                             Description => '[0-9]',
13564                                             Initialize => $Digit & $ASCII,
13565                                             );
13566
13567     # Hex_Digit was not present in first release
13568     my $Xdigit = $perl->add_match_table('XDigit');
13569     $Xdigit->add_alias('XPosixXDigit');
13570     my $Hex = property_ref('Hex_Digit');
13571     if (defined $Hex && ! $Hex->is_empty) {
13572         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13573     }
13574     else {
13575         $Xdigit->initialize([ ord('0') .. ord('9'),
13576                               ord('A') .. ord('F'),
13577                               ord('a') .. ord('f'),
13578                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13579         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13580     }
13581
13582     # AHex was not present in early releases
13583     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13584     my $AHex = property_ref('ASCII_Hex_Digit');
13585     if (defined $AHex && ! $AHex->is_empty) {
13586         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13587     }
13588     else {
13589         $PosixXDigit->initialize($Xdigit & $ASCII);
13590         $PosixXDigit->add_alias('AHex');
13591         $PosixXDigit->add_alias('Ascii_Hex_Digit');
13592     }
13593     $PosixXDigit->add_description('[0-9A-Fa-f]');
13594
13595     my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
13596                     Description => "Code points that particpate in some fold",
13597                     );
13598     #
13599     foreach my $range (property_ref('Case_Folding')->ranges) {
13600         $any_folds->add_range($range->start, $range->end);
13601         foreach my $hex_code_point (split " ", $range->value) {
13602             my $code_point = hex $hex_code_point;
13603             $any_folds->add_range($code_point, $code_point);
13604         }
13605     }
13606
13607     my $dt = property_ref('Decomposition_Type');
13608     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13609         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13610         Perl_Extension => 1,
13611         Note => 'Union of all non-canonical decompositions',
13612         );
13613
13614     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13615     # than SD appeared, construct it ourselves, based on the first release SD
13616     # was in.  A pod entry is grandfathered in for it
13617     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13618                                            Perl_Extension => 1,
13619                                            Fate => $INTERNAL_ONLY,
13620                                            Status => $DISCOURAGED);
13621     my $soft_dotted = property_ref('Soft_Dotted');
13622     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13623         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13624     }
13625     else {
13626
13627         # This list came from 3.2 Soft_Dotted; all of these code points are in
13628         # all releases
13629         $CanonDCIJ->initialize([ ord('i'),
13630                                  ord('j'),
13631                                  0x012F,
13632                                  0x0268,
13633                                  0x0456,
13634                                  0x0458,
13635                                  0x1E2D,
13636                                  0x1ECB,
13637                                ]);
13638         $CanonDCIJ = $CanonDCIJ & $Assigned;
13639     }
13640
13641     # For backward compatibility, Perl has its own definition for IDStart.
13642     # It is regular XID_Start plus the underscore, but all characters must be
13643     # Word characters as well
13644     my $XID_Start = property_ref('XID_Start');
13645     my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13646                                             Perl_Extension => 1,
13647                                             Fate => $INTERNAL_ONLY,
13648                                             Initialize => ord('_')
13649                                             );
13650     if (defined $XID_Start
13651         || defined ($XID_Start = property_ref('ID_Start')))
13652     {
13653         $perl_xids += $XID_Start->table('Y');
13654     }
13655     else {
13656         # For Unicode versions that don't have the property, construct our own
13657         # from first principles.  The actual definition is:
13658         #     Letters
13659         #   + letter numbers (Nl)
13660         #   - Pattern_Syntax
13661         #   - Pattern_White_Space
13662         #   + stability extensions
13663         #   - NKFC modifications
13664         #
13665         # What we do in the code below is to include the identical code points
13666         # that are in the first release that had Unicode's version of this
13667         # property, essentially extrapolating backwards.  There were no
13668         # stability extensions until v4.1, so none are included; likewise in
13669         # no Unicode version so far do subtracting PatSyn and PatWS make any
13670         # difference, so those also are ignored.
13671         $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
13672
13673         # We do subtract the NFKC modifications that are in the first version
13674         # that had this property.  We don't bother to test if they are in the
13675         # version in question, because if they aren't, the operation is a
13676         # no-op.  The NKFC modifications are discussed in
13677         # http://www.unicode.org/reports/tr31/#NFKC_Modifications
13678         foreach my $range ( 0x037A,
13679                             0x0E33,
13680                             0x0EB3,
13681                             [ 0xFC5E, 0xFC63 ],
13682                             [ 0xFDFA, 0xFE70 ],
13683                             [ 0xFE72, 0xFE76 ],
13684                             0xFE78,
13685                             0xFE7A,
13686                             0xFE7C,
13687                             0xFE7E,
13688                             [ 0xFF9E, 0xFF9F ],
13689         ) {
13690             if (ref $range) {
13691                 $perl_xids->delete_range($range->[0], $range->[1]);
13692             }
13693             else {
13694                 $perl_xids->delete_range($range, $range);
13695             }
13696         }
13697     }
13698
13699     $perl_xids &= $Word;
13700
13701     my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
13702                                         Perl_Extension => 1,
13703                                         Fate => $INTERNAL_ONLY);
13704     my $XIDC = property_ref('XID_Continue');
13705     if (defined $XIDC
13706         || defined ($XIDC = property_ref('ID_Continue')))
13707     {
13708         $perl_xidc += $XIDC->table('Y');
13709     }
13710     else {
13711         # Similarly, we construct our own XIDC if necessary for early Unicode
13712         # versions.  The definition is:
13713         #     everything in XIDS
13714         #   + Gc=Mn
13715         #   + Gc=Mc
13716         #   + Gc=Nd
13717         #   + Gc=Pc
13718         #   - Pattern_Syntax
13719         #   - Pattern_White_Space
13720         #   + stability extensions
13721         #   - NFKC modifications
13722         #
13723         # The same thing applies to this as with XIDS for the PatSyn, PatWS,
13724         # and stability extensions.  There is a somewhat different set of NFKC
13725         # mods to remove (and add in this case).  The ones below make this
13726         # have identical code points as in the first release that defined it.
13727         $perl_xidc += $perl_xids
13728                     + $gc->table('L')
13729                     + $gc->table('Mn')
13730                     + $gc->table('Mc')
13731                     + $gc->table('Nd')
13732                     + utf8::unicode_to_native(0xB7)
13733                     ;
13734         if (defined (my $pc = $gc->table('Pc'))) {
13735             $perl_xidc += $pc;
13736         }
13737         else {  # 1.1.5 didn't have Pc, but these should have been in it
13738             $perl_xidc += 0xFF3F;
13739             $perl_xidc->add_range(0x203F, 0x2040);
13740             $perl_xidc->add_range(0xFE33, 0xFE34);
13741             $perl_xidc->add_range(0xFE4D, 0xFE4F);
13742         }
13743
13744         # Subtract the NFKC mods
13745         foreach my $range ( 0x037A,
13746                             [ 0xFC5E, 0xFC63 ],
13747                             [ 0xFDFA, 0xFE1F ],
13748                             0xFE70,
13749                             [ 0xFE72, 0xFE76 ],
13750                             0xFE78,
13751                             0xFE7A,
13752                             0xFE7C,
13753                             0xFE7E,
13754         ) {
13755             if (ref $range) {
13756                 $perl_xidc->delete_range($range->[0], $range->[1]);
13757             }
13758             else {
13759                 $perl_xidc->delete_range($range, $range);
13760             }
13761         }
13762     }
13763
13764     $perl_xidc &= $Word;
13765
13766     my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
13767                     Perl_Extension => 1,
13768                     Fate => $INTERNAL_ONLY,
13769                     Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
13770                     );
13771
13772     my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
13773                         Perl_Extension => 1,
13774                         Fate => $INTERNAL_ONLY,
13775                         Initialize => $perl_xidc
13776                                     + ord(" ")
13777                                     + ord("(")
13778                                     + ord(")")
13779                                     + ord("-")
13780                                     + utf8::unicode_to_native(0xA0) # NBSP
13781                         );
13782
13783     # These two tables are for matching \X, which is based on the 'extended'
13784     # grapheme cluster, which came in 5.1; create empty ones if not already
13785     # present.  The straight 'grapheme cluster' (non-extended) is used prior
13786     # to 5.1, and differs from the extended (see
13787     # http://www.unicode.org/reports/tr29/) only by these two tables, so we
13788     # get the older definition automatically when they are empty.
13789     my $gcb = property_ref('Grapheme_Cluster_Break');
13790     my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
13791                                         Perl_Extension => 1,
13792                                         Fate => $INTERNAL_ONLY);
13793     if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
13794         $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
13795     }
13796     else {
13797         push @tables_that_may_be_empty, $perl_prepend->complete_name;
13798     }
13799
13800     # All the tables with _X_ in their names are used in defining \X handling,
13801     # and are based on the Unicode GCB property.  Basically, \X matches:
13802     #   CR LF
13803     #   | Prepend* Begin Extend*
13804     #   | .
13805     # Begin is:           ( Special_Begin | ! Control )
13806     # Begin is also:      ( Regular_Begin | Special_Begin )
13807     #   where Regular_Begin is defined as ( ! Control - Special_Begin )
13808     # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
13809     # Extend is:          ( Grapheme_Extend | Spacing_Mark )
13810     # Control is:         [ GCB_Control | CR | LF ]
13811     # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
13812
13813     foreach my $gcb_name (qw{ L V T LV LVT }) {
13814
13815         # The perl internal extension's name is the gcb table name prepended
13816         # with an '_X_'
13817         my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
13818                                         Perl_Extension => 1,
13819                                         Fate => $INTERNAL_ONLY,
13820                                         Initialize => $gcb->table($gcb_name),
13821                                         );
13822         # Version 1 had mostly different Hangul syllables that were removed
13823         # from later versions, so some of the tables may not apply.
13824         if ($v_version lt v2.0) {
13825             push @tables_that_may_be_empty, $perl_table->complete_name;
13826         }
13827     }
13828
13829     # More GCB.  Populate a combined hangul syllables table
13830     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
13831                                           Perl_Extension => 1,
13832                                           Fate => $INTERNAL_ONLY);
13833     $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
13834     $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
13835
13836     my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
13837                                     Fate => $INTERNAL_ONLY);
13838     if ($v_version ge v6.2) {
13839         $ri += $gcb->table('RI');
13840     }
13841     else {
13842         push @tables_that_may_be_empty, $ri->full_name;
13843     }
13844
13845     my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
13846                                        Perl_Extension => 1,
13847                                        Fate => $INTERNAL_ONLY,
13848                                        Initialize => $lv_lvt_v
13849                                                    + $gcb->table('L')
13850                                                    + $gcb->table('T')
13851                                                    + $ri
13852                                       );
13853     $specials_begin->add_comment(join_lines( <<END
13854 For use in \\X; matches first (perhaps only) character of potential
13855 multi-character sequences that can begin an extended grapheme cluster.  They
13856 need special handling because of their complicated nature.
13857 END
13858     ));
13859     my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
13860                                        Perl_Extension => 1,
13861                                        Fate => $INTERNAL_ONLY,
13862                                        Initialize => ~ $gcb->table('Control')
13863                                                    - $specials_begin
13864                                                    - $gcb->table('CR')
13865                                                    - $gcb->table('LF')
13866                                       );
13867     $regular_begin->add_comment(join_lines( <<END
13868 For use in \\X; matches first character of anything that can begin an extended
13869 grapheme cluster, except those that require special handling.
13870 END
13871     ));
13872
13873     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
13874                                         Fate => $INTERNAL_ONLY,
13875                                         Initialize => $gcb->table('Extend')
13876                                        );
13877     if (defined (my $sm = $gcb->table('SpacingMark'))) {
13878         $extend += $sm;
13879     }
13880     $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
13881
13882     # End of GCB \X processing
13883
13884     my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
13885
13886     if (@named_sequences) {
13887         push @composition, 'Named_Sequence';
13888         foreach my $sequence (@named_sequences) {
13889             $perl_charname->add_anomalous_entry($sequence);
13890         }
13891     }
13892
13893     my $alias_sentence = "";
13894     my %abbreviations;
13895     my $alias = property_ref('Name_Alias');
13896     $perl_charname->set_proxy_for('Name_Alias');
13897
13898     # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
13899     # respect to any existing entry depends on the entry type.  Corrections go
13900     # before said entry, as they should be returned in preference over the
13901     # existing entry.  (A correction to a correction should be later in the
13902     # Name_Alias table, so it will correctly precede the erroneous correction
13903     # in Perl_Charnames.)
13904     #
13905     # Abbreviations go after everything else, so they are saved temporarily in
13906     # a hash for later.
13907     #
13908     # Everything else is added added afterwards, which preserves the input
13909     # ordering
13910
13911     foreach my $range ($alias->ranges) {
13912         next if $range->value eq "";
13913         my $code_point = $range->start;
13914         if ($code_point != $range->end) {
13915             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
13916         }
13917         my ($value, $type) = split ': ', $range->value;
13918         my $replace_type;
13919         if ($type eq 'correction') {
13920             $replace_type = $MULTIPLE_BEFORE;
13921         }
13922         elsif ($type eq 'abbreviation') {
13923
13924             # Save for later
13925             $abbreviations{$value} = $code_point;
13926             next;
13927         }
13928         else {
13929             $replace_type = $MULTIPLE_AFTER;
13930         }
13931
13932         # Actually add; before or after current entry(ies) as determined
13933         # above.
13934
13935         $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
13936     }
13937     $alias_sentence = <<END;
13938 The Name_Alias property adds duplicate code point entries that are
13939 alternatives to the original name.  If an addition is a corrected
13940 name, it will be physically first in the table.  The original (less correct,
13941 but still valid) name will be next; then any alternatives, in no particular
13942 order; and finally any abbreviations, again in no particular order.
13943 END
13944
13945     # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
13946     # precedence before 6.1, so should be first in the file; the other names
13947     # have precedence starting in 6.1,
13948     my $before_or_after = ($v_version lt v6.1.0)
13949                           ? $MULTIPLE_BEFORE
13950                           : $MULTIPLE_AFTER;
13951
13952     foreach my $range (property_ref('Unicode_1_Name')->ranges) {
13953         my $code_point = $range->start;
13954         my $unicode_1_value = $range->value;
13955         next if $unicode_1_value eq "";     # Skip if name doesn't exist.
13956
13957         if ($code_point != $range->end) {
13958             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
13959         }
13960
13961         # To handle EBCDIC, we don't hard code in the code points of the
13962         # controls; instead realizing that all of them are below 256.
13963         last if $code_point > 255;
13964
13965         # We only add in the controls.
13966         next if $gc->value_of($code_point) ne 'Cc';
13967
13968         # We reject this Unicode1 name for later Perls, as it is used for
13969         # another code point
13970         next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
13971
13972         # This won't add an exact duplicate.
13973         $perl_charname->add_duplicate($code_point, $unicode_1_value,
13974                                         Replace => $before_or_after);
13975     }
13976
13977     # But in this version only, the ALERT has precedence over BELL, the
13978     # Unicode_1_Name that would otherwise have precedence.
13979     if ($v_version eq v6.0.0) {
13980         $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
13981     }
13982
13983     # Now that have everything added, add in abbreviations after
13984     # everything else.  Sort so results don't change between runs of this
13985     # program
13986     foreach my $value (sort keys %abbreviations) {
13987         $perl_charname->add_duplicate($abbreviations{$value}, $value,
13988                                         Replace => $MULTIPLE_AFTER);
13989     }
13990
13991     my $comment;
13992     if (@composition <= 2) { # Always at least 2
13993         $comment = join " and ", @composition;
13994     }
13995     else {
13996         $comment = join ", ", @composition[0 .. scalar @composition - 2];
13997         $comment .= ", and $composition[-1]";
13998     }
13999
14000     $perl_charname->add_comment(join_lines( <<END
14001 This file is for charnames.pm.  It is the union of the $comment properties.
14002 Unicode_1_Name entries are used only for nameless code points in the Name
14003 property.
14004 $alias_sentence
14005 This file doesn't include the algorithmically determinable names.  For those,
14006 use 'unicore/Name.pm'
14007 END
14008     ));
14009     property_ref('Name')->add_comment(join_lines( <<END
14010 This file doesn't include the algorithmically determinable names.  For those,
14011 use 'unicore/Name.pm'
14012 END
14013     ));
14014
14015     # Construct the Present_In property from the Age property.
14016     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
14017         my $default_map = $age->default_map;
14018         my $in = Property->new('In',
14019                                 Default_Map => $default_map,
14020                                 Full_Name => "Present_In",
14021                                 Perl_Extension => 1,
14022                                 Type => $ENUM,
14023                                 Initialize => $age,
14024                                 );
14025         $in->add_comment(join_lines(<<END
14026 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
14027 same as for $age, and not for what $in really means.  This is because anything
14028 defined in a given release should have multiple values: that release and all
14029 higher ones.  But only one value per code point can be represented in a table
14030 like this.
14031 END
14032         ));
14033
14034         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
14035         # lowest numbered (earliest) come first, with the non-numeric one
14036         # last.
14037         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
14038                                             ? 1
14039                                             : ($b->name !~ /^[\d.]*$/)
14040                                                 ? -1
14041                                                 : $a->name <=> $b->name
14042                                             } $age->tables;
14043
14044         # The Present_In property is the cumulative age properties.  The first
14045         # one hence is identical to the first age one.
14046         my $previous_in = $in->add_match_table($first_age->name);
14047         $previous_in->set_equivalent_to($first_age, Related => 1);
14048
14049         my $description_start = "Code point's usage introduced in version ";
14050         $first_age->add_description($description_start . $first_age->name);
14051
14052         # To construct the accumulated values, for each of the age tables
14053         # starting with the 2nd earliest, merge the earliest with it, to get
14054         # all those code points existing in the 2nd earliest.  Repeat merging
14055         # the new 2nd earliest with the 3rd earliest to get all those existing
14056         # in the 3rd earliest, and so on.
14057         foreach my $current_age (@rest_ages) {
14058             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
14059
14060             my $current_in = $in->add_match_table(
14061                                     $current_age->name,
14062                                     Initialize => $current_age + $previous_in,
14063                                     Description => $description_start
14064                                                     . $current_age->name
14065                                                     . ' or earlier',
14066                                     );
14067             $previous_in = $current_in;
14068
14069             # Add clarifying material for the corresponding age file.  This is
14070             # in part because of the confusing and contradictory information
14071             # given in the Standard's documentation itself, as of 5.2.
14072             $current_age->add_description(
14073                             "Code point's usage was introduced in version "
14074                             . $current_age->name);
14075             $current_age->add_note("See also $in");
14076
14077         }
14078
14079         # And finally the code points whose usages have yet to be decided are
14080         # the same in both properties.  Note that permanently unassigned code
14081         # points actually have their usage assigned (as being permanently
14082         # unassigned), so that these tables are not the same as gc=cn.
14083         my $unassigned = $in->add_match_table($default_map);
14084         my $age_default = $age->table($default_map);
14085         $age_default->add_description(<<END
14086 Code point's usage has not been assigned in any Unicode release thus far.
14087 END
14088         );
14089         $unassigned->set_equivalent_to($age_default, Related => 1);
14090     }
14091
14092     # See L<perlfunc/quotemeta>
14093     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
14094                                            Perl_Extension => 1,
14095                                            Fate => $INTERNAL_ONLY,
14096
14097                                            # Initialize to what's common in
14098                                            # all Unicode releases.
14099                                            Initialize =>
14100                                                 $Space
14101                                                 + $gc->table('Control')
14102                            );
14103
14104     # In early releases without the proper Unicode properties, just set to \W.
14105     if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
14106         || ! defined (my $patws = property_ref('Pattern_White_Space'))
14107         || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
14108     {
14109         $quotemeta += ~ $Word;
14110     }
14111     else {
14112         $quotemeta += $patsyn->table('Y')
14113                    + $patws->table('Y')
14114                    + $di->table('Y')
14115                    + ((~ $Word) & $ASCII);
14116     }
14117
14118     # Finished creating all the perl properties.  All non-internal non-string
14119     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
14120     # an underscore.)  These do not get a separate entry in the pod file
14121     foreach my $table ($perl->tables) {
14122         foreach my $alias ($table->aliases) {
14123             next if $alias->name =~ /^_/;
14124             $table->add_alias('Is_' . $alias->name,
14125                                Re_Pod_Entry => 0,
14126                                UCD => 0,
14127                                Status => $alias->status,
14128                                OK_as_Filename => 0);
14129         }
14130     }
14131
14132     # Here done with all the basic stuff.  Ready to populate the information
14133     # about each character if annotating them.
14134     if ($annotate) {
14135
14136         # See comments at its declaration
14137         $annotate_ranges = Range_Map->new;
14138
14139         # This separates out the non-characters from the other unassigneds, so
14140         # can give different annotations for each.
14141         $unassigned_sans_noncharacters = Range_List->new(
14142                                     Initialize => $gc->table('Unassigned'));
14143         if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
14144             $unassigned_sans_noncharacters &= $nonchars->table('N');
14145         }
14146
14147         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
14148             $i = populate_char_info($i);    # Note sets $i so may cause skips
14149         }
14150     }
14151
14152     return;
14153 }
14154
14155 sub add_perl_synonyms() {
14156     # A number of Unicode tables have Perl synonyms that are expressed in
14157     # the single-form, \p{name}.  These are:
14158     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
14159     #       \p{Is_Name} as synonyms
14160     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
14161     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
14162     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
14163     #       conflict, \p{Value} and \p{Is_Value} as well
14164     #
14165     # This routine generates these synonyms, warning of any unexpected
14166     # conflicts.
14167
14168     # Construct the list of tables to get synonyms for.  Start with all the
14169     # binary and the General_Category ones.
14170     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
14171                                                             property_ref('*');
14172     push @tables, $gc->tables;
14173
14174     # If the version of Unicode includes the Script property, add its tables
14175     push @tables, $script->tables if defined $script;
14176
14177     # The Block tables are kept separate because they are treated differently.
14178     # And the earliest versions of Unicode didn't include them, so add only if
14179     # there are some.
14180     my @blocks;
14181     push @blocks, $block->tables if defined $block;
14182
14183     # Here, have the lists of tables constructed.  Process blocks last so that
14184     # if there are name collisions with them, blocks have lowest priority.
14185     # Should there ever be other collisions, manual intervention would be
14186     # required.  See the comments at the beginning of the program for a
14187     # possible way to handle those semi-automatically.
14188     foreach my $table (@tables,  @blocks) {
14189
14190         # For non-binary properties, the synonym is just the name of the
14191         # table, like Greek, but for binary properties the synonym is the name
14192         # of the property, and means the code points in its 'Y' table.
14193         my $nominal = $table;
14194         my $nominal_property = $nominal->property;
14195         my $actual;
14196         if (! $nominal->isa('Property')) {
14197             $actual = $table;
14198         }
14199         else {
14200
14201             # Here is a binary property.  Use the 'Y' table.  Verify that is
14202             # there
14203             my $yes = $nominal->table('Y');
14204             unless (defined $yes) {  # Must be defined, but is permissible to
14205                                      # be empty.
14206                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
14207                 next;
14208             }
14209             $actual = $yes;
14210         }
14211
14212         foreach my $alias ($nominal->aliases) {
14213
14214             # Attempt to create a table in the perl directory for the
14215             # candidate table, using whatever aliases in it that don't
14216             # conflict.  Also add non-conflicting aliases for all these
14217             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
14218             PREFIX:
14219             foreach my $prefix ("", 'Is_', 'In_') {
14220
14221                 # Only Block properties can have added 'In_' aliases.
14222                 next if $prefix eq 'In_' and $nominal_property != $block;
14223
14224                 my $proposed_name = $prefix . $alias->name;
14225
14226                 # No Is_Is, In_In, nor combinations thereof
14227                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
14228                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
14229
14230                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
14231
14232                 # Get a reference to any existing table in the perl
14233                 # directory with the desired name.
14234                 my $pre_existing = $perl->table($proposed_name);
14235
14236                 if (! defined $pre_existing) {
14237
14238                     # No name collision, so ok to add the perl synonym.
14239
14240                     my $make_re_pod_entry;
14241                     my $ok_as_filename;
14242                     my $status = $alias->status;
14243                     if ($nominal_property == $block) {
14244
14245                         # For block properties, the 'In' form is preferred for
14246                         # external use; the pod file contains wild cards for
14247                         # this and the 'Is' form so no entries for those; and
14248                         # we don't want people using the name without the
14249                         # 'In', so discourage that.
14250                         if ($prefix eq "") {
14251                             $make_re_pod_entry = 1;
14252                             $status = $status || $DISCOURAGED;
14253                             $ok_as_filename = 0;
14254                         }
14255                         elsif ($prefix eq 'In_') {
14256                             $make_re_pod_entry = 0;
14257                             $status = $status || $NORMAL;
14258                             $ok_as_filename = 1;
14259                         }
14260                         else {
14261                             $make_re_pod_entry = 0;
14262                             $status = $status || $DISCOURAGED;
14263                             $ok_as_filename = 0;
14264                         }
14265                     }
14266                     elsif ($prefix ne "") {
14267
14268                         # The 'Is' prefix is handled in the pod by a wild
14269                         # card, and we won't use it for an external name
14270                         $make_re_pod_entry = 0;
14271                         $status = $status || $NORMAL;
14272                         $ok_as_filename = 0;
14273                     }
14274                     else {
14275
14276                         # Here, is an empty prefix, non block.  This gets its
14277                         # own pod entry and can be used for an external name.
14278                         $make_re_pod_entry = 1;
14279                         $status = $status || $NORMAL;
14280                         $ok_as_filename = 1;
14281                     }
14282
14283                     # Here, there isn't a perl pre-existing table with the
14284                     # name.  Look through the list of equivalents of this
14285                     # table to see if one is a perl table.
14286                     foreach my $equivalent ($actual->leader->equivalents) {
14287                         next if $equivalent->property != $perl;
14288
14289                         # Here, have found a table for $perl.  Add this alias
14290                         # to it, and are done with this prefix.
14291                         $equivalent->add_alias($proposed_name,
14292                                         Re_Pod_Entry => $make_re_pod_entry,
14293
14294                                         # Currently don't output these in the
14295                                         # ucd pod, as are strongly discouraged
14296                                         # from being used
14297                                         UCD => 0,
14298
14299                                         Status => $status,
14300                                         OK_as_Filename => $ok_as_filename);
14301                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14302                         next PREFIX;
14303                     }
14304
14305                     # Here, $perl doesn't already have a table that is a
14306                     # synonym for this property, add one.
14307                     my $added_table = $perl->add_match_table($proposed_name,
14308                                             Re_Pod_Entry => $make_re_pod_entry,
14309
14310                                             # See UCD comment just above
14311                                             UCD => 0,
14312
14313                                             Status => $status,
14314                                             OK_as_Filename => $ok_as_filename);
14315                     # And it will be related to the actual table, since it is
14316                     # based on it.
14317                     $added_table->set_equivalent_to($actual, Related => 1);
14318                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14319                     next;
14320                 } # End of no pre-existing.
14321
14322                 # Here, there is a pre-existing table that has the proposed
14323                 # name.  We could be in trouble, but not if this is just a
14324                 # synonym for another table that we have already made a child
14325                 # of the pre-existing one.
14326                 if ($pre_existing->is_set_equivalent_to($actual)) {
14327                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14328                     $pre_existing->add_alias($proposed_name);
14329                     next;
14330                 }
14331
14332                 # Here, there is a name collision, but it still could be ok if
14333                 # the tables match the identical set of code points, in which
14334                 # case, we can combine the names.  Compare each table's code
14335                 # point list to see if they are identical.
14336                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14337                 if ($pre_existing->matches_identically_to($actual)) {
14338
14339                     # Here, they do match identically.  Not a real conflict.
14340                     # Make the perl version a child of the Unicode one, except
14341                     # in the non-obvious case of where the perl name is
14342                     # already a synonym of another Unicode property.  (This is
14343                     # excluded by the test for it being its own parent.)  The
14344                     # reason for this exclusion is that then the two Unicode
14345                     # properties become related; and we don't really know if
14346                     # they are or not.  We generate documentation based on
14347                     # relatedness, and this would be misleading.  Code
14348                     # later executed in the process will cause the tables to
14349                     # be represented by a single file anyway, without making
14350                     # it look in the pod like they are necessarily related.
14351                     if ($pre_existing->parent == $pre_existing
14352                         && ($pre_existing->property == $perl
14353                             || $actual->property == $perl))
14354                     {
14355                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14356                         $pre_existing->set_equivalent_to($actual, Related => 1);
14357                     }
14358                     elsif (main::DEBUG && $to_trace) {
14359                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14360                         trace $pre_existing->parent;
14361                     }
14362                     next PREFIX;
14363                 }
14364
14365                 # Here they didn't match identically, there is a real conflict
14366                 # between our new name and a pre-existing property.
14367                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14368                 $pre_existing->add_conflicting($nominal->full_name,
14369                                                'p',
14370                                                $actual);
14371
14372                 # Don't output a warning for aliases for the block
14373                 # properties (unless they start with 'In_') as it is
14374                 # expected that there will be conflicts and the block
14375                 # form loses.
14376                 if ($verbosity >= $NORMAL_VERBOSITY
14377                     && ($actual->property != $block || $prefix eq 'In_'))
14378                 {
14379                     print simple_fold(join_lines(<<END
14380 There is already an alias named $proposed_name (from $pre_existing),
14381 so not creating this alias for $actual
14382 END
14383                     ), "", 4);
14384                 }
14385
14386                 # Keep track for documentation purposes.
14387                 $has_In_conflicts++ if $prefix eq 'In_';
14388                 $has_Is_conflicts++ if $prefix eq 'Is_';
14389             }
14390         }
14391     }
14392
14393     # There are some properties which have No and Yes (and N and Y) as
14394     # property values, but aren't binary, and could possibly be confused with
14395     # binary ones.  So create caveats for them.  There are tables that are
14396     # named 'No', and tables that are named 'N', but confusion is not likely
14397     # unless they are the same table.  For example, N meaning Number or
14398     # Neutral is not likely to cause confusion, so don't add caveats to things
14399     # like them.
14400     foreach my $property (grep { $_->type != $BINARY
14401                                  && $_->type != $FORCED_BINARY }
14402                                                             property_ref('*'))
14403     {
14404         my $yes = $property->table('Yes');
14405         if (defined $yes) {
14406             my $y = $property->table('Y');
14407             if (defined $y && $yes == $y) {
14408                 foreach my $alias ($property->aliases) {
14409                     $yes->add_conflicting($alias->name);
14410                 }
14411             }
14412         }
14413         my $no = $property->table('No');
14414         if (defined $no) {
14415             my $n = $property->table('N');
14416             if (defined $n && $no == $n) {
14417                 foreach my $alias ($property->aliases) {
14418                     $no->add_conflicting($alias->name, 'P');
14419                 }
14420             }
14421         }
14422     }
14423
14424     return;
14425 }
14426
14427 sub register_file_for_name($$$) {
14428     # Given info about a table and a datafile that it should be associated
14429     # with, register that association
14430
14431     my $table = shift;
14432     my $directory_ref = shift;   # Array of the directory path for the file
14433     my $file = shift;            # The file name in the final directory.
14434     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14435
14436     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14437
14438     if ($table->isa('Property')) {
14439         $table->set_file_path(@$directory_ref, $file);
14440         push @map_properties, $table;
14441
14442         # No swash means don't do the rest of this.
14443         return if $table->fate != $ORDINARY;
14444
14445         # Get the path to the file
14446         my @path = $table->file_path;
14447
14448         # Use just the file name if no subdirectory.
14449         shift @path if $path[0] eq File::Spec->curdir();
14450
14451         my $file = join '/', @path;
14452
14453         # Create a hash entry for utf8_heavy to get the file that stores this
14454         # property's map table
14455         foreach my $alias ($table->aliases) {
14456             my $name = $alias->name;
14457             $loose_property_to_file_of{standardize($name)} = $file;
14458         }
14459
14460         # And a way for utf8_heavy to find the proper key in the SwashInfo
14461         # hash for this property.
14462         $file_to_swash_name{$file} = "To" . $table->swash_name;
14463         return;
14464     }
14465
14466     # Do all of the work for all equivalent tables when called with the leader
14467     # table, so skip if isn't the leader.
14468     return if $table->leader != $table;
14469
14470     # If this is a complement of another file, use that other file instead,
14471     # with a ! prepended to it.
14472     my $complement;
14473     if (($complement = $table->complement) != 0) {
14474         my @directories = $complement->file_path;
14475
14476         # This assumes that the 0th element is something like 'lib',
14477         # the 1th element the property name (in its own directory), like
14478         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14479         # appended to it later.
14480         $directories[1] =~ s/^/!/;
14481         $file = pop @directories;
14482         $directory_ref =\@directories;
14483     }
14484
14485     # Join all the file path components together, using slashes.
14486     my $full_filename = join('/', @$directory_ref, $file);
14487
14488     # All go in the same subdirectory of unicore
14489     if ($directory_ref->[0] ne $matches_directory) {
14490         Carp::my_carp("Unexpected directory in "
14491                 .  join('/', @{$directory_ref}, $file));
14492     }
14493
14494     # For this table and all its equivalents ...
14495     foreach my $table ($table, $table->equivalents) {
14496
14497         # Associate it with its file internally.  Don't include the
14498         # $matches_directory first component
14499         $table->set_file_path(@$directory_ref, $file);
14500
14501         # No swash means don't do the rest of this.
14502         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14503
14504         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14505
14506         my $property = $table->property;
14507         my $property_name = ($property == $perl)
14508                              ? ""  # 'perl' is never explicitly stated
14509                              : standardize($property->name) . '=';
14510
14511         my $is_default = 0; # Is this table the default one for the property?
14512
14513         # To calculate $is_default, we find if this table is the same as the
14514         # default one for the property.  But this is complicated by the
14515         # possibility that there is a master table for this one, and the
14516         # information is stored there instead of here.
14517         my $parent = $table->parent;
14518         my $leader_prop = $parent->property;
14519         my $default_map = $leader_prop->default_map;
14520         if (defined $default_map) {
14521             my $default_table = $leader_prop->table($default_map);
14522             $is_default = 1 if defined $default_table && $parent == $default_table;
14523         }
14524
14525         # Calculate the loose name for this table.  Mostly it's just its name,
14526         # standardized.  But in the case of Perl tables that are single-form
14527         # equivalents to Unicode properties, it is the latter's name.
14528         my $loose_table_name =
14529                         ($property != $perl || $leader_prop == $perl)
14530                         ? standardize($table->name)
14531                         : standardize($parent->name);
14532
14533         my $deprecated = ($table->status eq $DEPRECATED)
14534                          ? $table->status_info
14535                          : "";
14536         my $caseless_equivalent = $table->caseless_equivalent;
14537
14538         # And for each of the table's aliases...  This inner loop eventually
14539         # goes through all aliases in the UCD that we generate regex match
14540         # files for
14541         foreach my $alias ($table->aliases) {
14542             my $standard = utf8_heavy_name($table, $alias);
14543
14544             # Generate an entry in either the loose or strict hashes, which
14545             # will translate the property and alias names combination into the
14546             # file where the table for them is stored.
14547             if ($alias->loose_match) {
14548                 if (exists $loose_to_file_of{$standard}) {
14549                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14550                 }
14551                 else {
14552                     $loose_to_file_of{$standard} = $sub_filename;
14553                 }
14554             }
14555             else {
14556                 if (exists $stricter_to_file_of{$standard}) {
14557                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14558                 }
14559                 else {
14560                     $stricter_to_file_of{$standard} = $sub_filename;
14561
14562                     # Tightly coupled with how utf8_heavy.pl works, for a
14563                     # floating point number that is a whole number, get rid of
14564                     # the trailing decimal point and 0's, so that utf8_heavy
14565                     # will work.  Also note that this assumes that such a
14566                     # number is matched strictly; so if that were to change,
14567                     # this would be wrong.
14568                     if ((my $integer_name = $alias->name)
14569                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14570                     {
14571                         $stricter_to_file_of{$property_name . $integer_name}
14572                                                             = $sub_filename;
14573                     }
14574                 }
14575             }
14576
14577             # For Unicode::UCD, create a mapping of the prop=value to the
14578             # canonical =value for that property.
14579             if ($standard =~ /=/) {
14580
14581                 # This could happen if a strict name mapped into an existing
14582                 # loose name.  In that event, the strict names would have to
14583                 # be moved to a new hash.
14584                 if (exists($loose_to_standard_value{$standard})) {
14585                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
14586                 }
14587                 $loose_to_standard_value{$standard} = $loose_table_name;
14588             }
14589
14590             # Keep a list of the deprecated properties and their filenames
14591             if ($deprecated && $complement == 0) {
14592                 $utf8::why_deprecated{$sub_filename} = $deprecated;
14593             }
14594
14595             # And a substitute table, if any, for case-insensitive matching
14596             if ($caseless_equivalent != 0) {
14597                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14598             }
14599
14600             # Add to defaults list if the table this alias belongs to is the
14601             # default one
14602             $loose_defaults{$standard} = 1 if $is_default;
14603         }
14604     }
14605
14606     return;
14607 }
14608
14609 {   # Closure
14610     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
14611                      # conflicts
14612     my %full_dir_name_of;   # Full length names of directories used.
14613
14614     sub construct_filename($$$) {
14615         # Return a file name for a table, based on the table name, but perhaps
14616         # changed to get rid of non-portable characters in it, and to make
14617         # sure that it is unique on a file system that allows the names before
14618         # any period to be at most 8 characters (DOS).  While we're at it
14619         # check and complain if there are any directory conflicts.
14620
14621         my $name = shift;       # The name to start with
14622         my $mutable = shift;    # Boolean: can it be changed?  If no, but
14623                                 # yet it must be to work properly, a warning
14624                                 # is given
14625         my $directories_ref = shift;  # A reference to an array containing the
14626                                 # path to the file, with each element one path
14627                                 # component.  This is used because the same
14628                                 # name can be used in different directories.
14629         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14630
14631         my $warn = ! defined wantarray;  # If true, then if the name is
14632                                 # changed, a warning is issued as well.
14633
14634         if (! defined $name) {
14635             Carp::my_carp("Undefined name in directory "
14636                           . File::Spec->join(@$directories_ref)
14637                           . ". '_' used");
14638             return '_';
14639         }
14640
14641         # Make sure that no directory names conflict with each other.  Look at
14642         # each directory in the input file's path.  If it is already in use,
14643         # assume it is correct, and is merely being re-used, but if we
14644         # truncate it to 8 characters, and find that there are two directories
14645         # that are the same for the first 8 characters, but differ after that,
14646         # then that is a problem.
14647         foreach my $directory (@$directories_ref) {
14648             my $short_dir = substr($directory, 0, 8);
14649             if (defined $full_dir_name_of{$short_dir}) {
14650                 next if $full_dir_name_of{$short_dir} eq $directory;
14651                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
14652             }
14653             else {
14654                 $full_dir_name_of{$short_dir} = $directory;
14655             }
14656         }
14657
14658         my $path = join '/', @$directories_ref;
14659         $path .= '/' if $path;
14660
14661         # Remove interior underscores.
14662         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14663
14664         # Change any non-word character into an underscore, and truncate to 8.
14665         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
14666         substr($filename, 8) = "" if length($filename) > 8;
14667
14668         # Make sure the basename doesn't conflict with something we
14669         # might have already written. If we have, say,
14670         #     InGreekExtended1
14671         #     InGreekExtended2
14672         # they become
14673         #     InGreekE
14674         #     InGreek2
14675         my $warned = 0;
14676         while (my $num = $base_names{$path}{lc $filename}++) {
14677             $num++; # so basenames with numbers start with '2', which
14678                     # just looks more natural.
14679
14680             # Want to append $num, but if it'll make the basename longer
14681             # than 8 characters, pre-truncate $filename so that the result
14682             # is acceptable.
14683             my $delta = length($filename) + length($num) - 8;
14684             if ($delta > 0) {
14685                 substr($filename, -$delta) = $num;
14686             }
14687             else {
14688                 $filename .= $num;
14689             }
14690             if ($warn && ! $warned) {
14691                 $warned = 1;
14692                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
14693             }
14694         }
14695
14696         return $filename if $mutable;
14697
14698         # If not changeable, must return the input name, but warn if needed to
14699         # change it beyond shortening it.
14700         if ($name ne $filename
14701             && substr($name, 0, length($filename)) ne $filename) {
14702             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
14703         }
14704         return $name;
14705     }
14706 }
14707
14708 # The pod file contains a very large table.  Many of the lines in that table
14709 # would exceed a typical output window's size, and so need to be wrapped with
14710 # a hanging indent to make them look good.  The pod language is really
14711 # insufficient here.  There is no general construct to do that in pod, so it
14712 # is done here by beginning each such line with a space to cause the result to
14713 # be output without formatting, and doing all the formatting here.  This leads
14714 # to the result that if the eventual display window is too narrow it won't
14715 # look good, and if the window is too wide, no advantage is taken of that
14716 # extra width.  A further complication is that the output may be indented by
14717 # the formatter so that there is less space than expected.  What I (khw) have
14718 # done is to assume that that indent is a particular number of spaces based on
14719 # what it is in my Linux system;  people can always resize their windows if
14720 # necessary, but this is obviously less than desirable, but the best that can
14721 # be expected.
14722 my $automatic_pod_indent = 8;
14723
14724 # Try to format so that uses fewest lines, but few long left column entries
14725 # slide into the right column.  An experiment on 5.1 data yielded the
14726 # following percentages that didn't cut into the other side along with the
14727 # associated first-column widths
14728 # 69% = 24
14729 # 80% not too bad except for a few blocks
14730 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
14731 # 95% = 37;
14732 my $indent_info_column = 27;    # 75% of lines didn't have overlap
14733
14734 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
14735                     # The 3 is because of:
14736                     #   1   for the leading space to tell the pod formatter to
14737                     #       output as-is
14738                     #   1   for the flag
14739                     #   1   for the space between the flag and the main data
14740
14741 sub format_pod_line ($$$;$$) {
14742     # Take a pod line and return it, formatted properly
14743
14744     my $first_column_width = shift;
14745     my $entry = shift;  # Contents of left column
14746     my $info = shift;   # Contents of right column
14747
14748     my $status = shift || "";   # Any flag
14749
14750     my $loose_match = shift;    # Boolean.
14751     $loose_match = 1 unless defined $loose_match;
14752
14753     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14754
14755     my $flags = "";
14756     $flags .= $STRICTER if ! $loose_match;
14757
14758     $flags .= $status if $status;
14759
14760     # There is a blank in the left column to cause the pod formatter to
14761     # output the line as-is.
14762     return sprintf " %-*s%-*s %s\n",
14763                     # The first * in the format is replaced by this, the -1 is
14764                     # to account for the leading blank.  There isn't a
14765                     # hard-coded blank after this to separate the flags from
14766                     # the rest of the line, so that in the unlikely event that
14767                     # multiple flags are shown on the same line, they both
14768                     # will get displayed at the expense of that separation,
14769                     # but since they are left justified, a blank will be
14770                     # inserted in the normal case.
14771                     $FILLER - 1,
14772                     $flags,
14773
14774                     # The other * in the format is replaced by this number to
14775                     # cause the first main column to right fill with blanks.
14776                     # The -1 is for the guaranteed blank following it.
14777                     $first_column_width - $FILLER - 1,
14778                     $entry,
14779                     $info;
14780 }
14781
14782 my @zero_match_tables;  # List of tables that have no matches in this release
14783
14784 sub make_re_pod_entries($) {
14785     # This generates the entries for the pod file for a given table.
14786     # Also done at this time are any children tables.  The output looks like:
14787     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
14788
14789     my $input_table = shift;        # Table the entry is for
14790     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14791
14792     # Generate parent and all its children at the same time.
14793     return if $input_table->parent != $input_table;
14794
14795     my $property = $input_table->property;
14796     my $type = $property->type;
14797     my $full_name = $property->full_name;
14798
14799     my $count = $input_table->count;
14800     my $string_count = clarify_number($count);
14801     my $status = $input_table->status;
14802     my $status_info = $input_table->status_info;
14803     my $caseless_equivalent = $input_table->caseless_equivalent;
14804
14805     # Don't mention a placeholder equivalent as it isn't to be listed in the
14806     # pod
14807     $caseless_equivalent = 0 if $caseless_equivalent != 0
14808                                 && $caseless_equivalent->fate > $ORDINARY;
14809
14810     my $entry_for_first_table; # The entry for the first table output.
14811                            # Almost certainly, it is the parent.
14812
14813     # For each related table (including itself), we will generate a pod entry
14814     # for each name each table goes by
14815     foreach my $table ($input_table, $input_table->children) {
14816
14817         # utf8_heavy.pl cannot deal with null string property values, so skip
14818         # any tables that have no non-null names.
14819         next if ! grep { $_->name ne "" } $table->aliases;
14820
14821         # First, gather all the info that applies to this table as a whole.
14822
14823         push @zero_match_tables, $table if $count == 0
14824                                             # Don't mention special tables
14825                                             # as being zero length
14826                                            && $table->fate == $ORDINARY;
14827
14828         my $table_property = $table->property;
14829
14830         # The short name has all the underscores removed, while the full name
14831         # retains them.  Later, we decide whether to output a short synonym
14832         # for the full one, we need to compare apples to apples, so we use the
14833         # short name's length including underscores.
14834         my $table_property_short_name_length;
14835         my $table_property_short_name
14836             = $table_property->short_name(\$table_property_short_name_length);
14837         my $table_property_full_name = $table_property->full_name;
14838
14839         # Get how much savings there is in the short name over the full one
14840         # (delta will always be <= 0)
14841         my $table_property_short_delta = $table_property_short_name_length
14842                                          - length($table_property_full_name);
14843         my @table_description = $table->description;
14844         my @table_note = $table->note;
14845
14846         # Generate an entry for each alias in this table.
14847         my $entry_for_first_alias;  # saves the first one encountered.
14848         foreach my $alias ($table->aliases) {
14849
14850             # Skip if not to go in pod.
14851             next unless $alias->make_re_pod_entry;
14852
14853             # Start gathering all the components for the entry
14854             my $name = $alias->name;
14855
14856             # Skip if name is empty, as can't be accessed by regexes.
14857             next if $name eq "";
14858
14859             my $entry;      # Holds the left column, may include extras
14860             my $entry_ref;  # To refer to the left column's contents from
14861                             # another entry; has no extras
14862
14863             # First the left column of the pod entry.  Tables for the $perl
14864             # property always use the single form.
14865             if ($table_property == $perl) {
14866                 $entry = "\\p{$name}";
14867                 $entry_ref = "\\p{$name}";
14868             }
14869             else {    # Compound form.
14870
14871                 # Only generate one entry for all the aliases that mean true
14872                 # or false in binary properties.  Append a '*' to indicate
14873                 # some are missing.  (The heading comment notes this.)
14874                 my $rhs;
14875                 if ($type == $BINARY) {
14876                     next if $name ne 'N' && $name ne 'Y';
14877                     $rhs = "$name*";
14878                 }
14879                 elsif ($type != $FORCED_BINARY) {
14880                     $rhs = $name;
14881                 }
14882                 else {
14883
14884                     # Forced binary properties require special handling.  It
14885                     # has two sets of tables, one set is true/false; and the
14886                     # other set is everything else.  Entries are generated for
14887                     # each set.  Use the Bidi_Mirrored property (which appears
14888                     # in all Unicode versions) to get a list of the aliases
14889                     # for the true/false tables.  Of these, only output the N
14890                     # and Y ones, the same as, a regular binary property.  And
14891                     # output all the rest, same as a non-binary property.
14892                     my $bm = property_ref("Bidi_Mirrored");
14893                     if ($name eq 'N' || $name eq 'Y') {
14894                         $rhs = "$name*";
14895                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
14896                                                         $bm->table("N")->aliases)
14897                     {
14898                         next;
14899                     }
14900                     else {
14901                         $rhs = $name;
14902                     }
14903                 }
14904
14905                 # Colon-space is used to give a little more space to be easier
14906                 # to read;
14907                 $entry = "\\p{"
14908                         . $table_property_full_name
14909                         . ": $rhs}";
14910
14911                 # But for the reference to this entry, which will go in the
14912                 # right column, where space is at a premium, use equals
14913                 # without a space
14914                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
14915             }
14916
14917             # Then the right (info) column.  This is stored as components of
14918             # an array for the moment, then joined into a string later.  For
14919             # non-internal only properties, begin the info with the entry for
14920             # the first table we encountered (if any), as things are ordered
14921             # so that that one is the most descriptive.  This leads to the
14922             # info column of an entry being a more descriptive version of the
14923             # name column
14924             my @info;
14925             if ($name =~ /^_/) {
14926                 push @info,
14927                         '(For internal use by Perl, not necessarily stable)';
14928             }
14929             elsif ($entry_for_first_alias) {
14930                 push @info, $entry_for_first_alias;
14931             }
14932
14933             # If this entry is equivalent to another, add that to the info,
14934             # using the first such table we encountered
14935             if ($entry_for_first_table) {
14936                 if (@info) {
14937                     push @info, "(= $entry_for_first_table)";
14938                 }
14939                 else {
14940                     push @info, $entry_for_first_table;
14941                 }
14942             }
14943
14944             # If the name is a large integer, add an equivalent with an
14945             # exponent for better readability
14946             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
14947                 push @info, sprintf "(= %.1e)", $name
14948             }
14949
14950             my $parenthesized = "";
14951             if (! $entry_for_first_alias) {
14952
14953                 # This is the first alias for the current table.  The alias
14954                 # array is ordered so that this is the fullest, most
14955                 # descriptive alias, so it gets the fullest info.  The other
14956                 # aliases are mostly merely pointers to this one, using the
14957                 # information already added above.
14958
14959                 # Display any status message, but only on the parent table
14960                 if ($status && ! $entry_for_first_table) {
14961                     push @info, $status_info;
14962                 }
14963
14964                 # Put out any descriptive info
14965                 if (@table_description || @table_note) {
14966                     push @info, join "; ", @table_description, @table_note;
14967                 }
14968
14969                 # Look to see if there is a shorter name we can point people
14970                 # at
14971                 my $standard_name = standardize($name);
14972                 my $short_name;
14973                 my $proposed_short = $table->short_name;
14974                 if (defined $proposed_short) {
14975                     my $standard_short = standardize($proposed_short);
14976
14977                     # If the short name is shorter than the standard one, or
14978                     # even it it's not, but the combination of it and its
14979                     # short property name (as in \p{prop=short} ($perl doesn't
14980                     # have this form)) saves at least two characters, then,
14981                     # cause it to be listed as a shorter synonym.
14982                     if (length $standard_short < length $standard_name
14983                         || ($table_property != $perl
14984                             && (length($standard_short)
14985                                 - length($standard_name)
14986                                 + $table_property_short_delta)  # (<= 0)
14987                                 < -2))
14988                     {
14989                         $short_name = $proposed_short;
14990                         if ($table_property != $perl) {
14991                             $short_name = $table_property_short_name
14992                                           . "=$short_name";
14993                         }
14994                         $short_name = "\\p{$short_name}";
14995                     }
14996                 }
14997
14998                 # And if this is a compound form name, see if there is a
14999                 # single form equivalent
15000                 my $single_form;
15001                 if ($table_property != $perl) {
15002
15003                     # Special case the binary N tables, so that will print
15004                     # \P{single}, but use the Y table values to populate
15005                     # 'single', as we haven't likewise populated the N table.
15006                     # For forced binary tables, we can't just look at the N
15007                     # table, but must see if this table is equivalent to the N
15008                     # one, as there are two equivalent beasts in these
15009                     # properties.
15010                     my $test_table;
15011                     my $p;
15012                     if (   ($type == $BINARY
15013                             && $input_table == $property->table('No'))
15014                         || ($type == $FORCED_BINARY
15015                             && $property->table('No')->
15016                                         is_set_equivalent_to($input_table)))
15017                     {
15018                         $test_table = $property->table('Yes');
15019                         $p = 'P';
15020                     }
15021                     else {
15022                         $test_table = $input_table;
15023                         $p = 'p';
15024                     }
15025
15026                     # Look for a single form amongst all the children.
15027                     foreach my $table ($test_table->children) {
15028                         next if $table->property != $perl;
15029                         my $proposed_name = $table->short_name;
15030                         next if ! defined $proposed_name;
15031
15032                         # Don't mention internal-only properties as a possible
15033                         # single form synonym
15034                         next if substr($proposed_name, 0, 1) eq '_';
15035
15036                         $proposed_name = "\\$p\{$proposed_name}";
15037                         if (! defined $single_form
15038                             || length($proposed_name) < length $single_form)
15039                         {
15040                             $single_form = $proposed_name;
15041
15042                             # The goal here is to find a single form; not the
15043                             # shortest possible one.  We've already found a
15044                             # short name.  So, stop at the first single form
15045                             # found, which is likely to be closer to the
15046                             # original.
15047                             last;
15048                         }
15049                     }
15050                 }
15051
15052                 # Ouput both short and single in the same parenthesized
15053                 # expression, but with only one of 'Single', 'Short' if there
15054                 # are both items.
15055                 if ($short_name || $single_form || $table->conflicting) {
15056                     $parenthesized .= "Short: $short_name" if $short_name;
15057                     if ($short_name && $single_form) {
15058                         $parenthesized .= ', ';
15059                     }
15060                     elsif ($single_form) {
15061                         $parenthesized .= 'Single: ';
15062                     }
15063                     $parenthesized .= $single_form if $single_form;
15064                 }
15065             }
15066
15067             if ($caseless_equivalent != 0) {
15068                 $parenthesized .=  '; ' if $parenthesized ne "";
15069                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
15070             }
15071
15072
15073             # Warn if this property isn't the same as one that a
15074             # semi-casual user might expect.  The other components of this
15075             # parenthesized structure are calculated only for the first entry
15076             # for this table, but the conflicting is deemed important enough
15077             # to go on every entry.
15078             my $conflicting = join " NOR ", $table->conflicting;
15079             if ($conflicting) {
15080                 $parenthesized .=  '; ' if $parenthesized ne "";
15081                 $parenthesized .= "NOT $conflicting";
15082             }
15083
15084             push @info, "($parenthesized)" if $parenthesized;
15085
15086             if ($name =~ /_$/ && $alias->loose_match) {
15087                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
15088             }
15089
15090             if ($table_property != $perl && $table->perl_extension) {
15091                 push @info, '(Perl extension)';
15092             }
15093             push @info, "($string_count)";
15094
15095             # Now, we have both the entry and info so add them to the
15096             # list of all the properties.
15097             push @match_properties,
15098                 format_pod_line($indent_info_column,
15099                                 $entry,
15100                                 join( " ", @info),
15101                                 $alias->status,
15102                                 $alias->loose_match);
15103
15104             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
15105         } # End of looping through the aliases for this table.
15106
15107         if (! $entry_for_first_table) {
15108             $entry_for_first_table = $entry_for_first_alias;
15109         }
15110     } # End of looping through all the related tables
15111     return;
15112 }
15113
15114 sub make_ucd_table_pod_entries {
15115     my $table = shift;
15116
15117     # Generate the entries for the UCD section of the pod for $table.  This
15118     # also calculates if names are ambiguous, so has to be called even if the
15119     # pod is not being output
15120
15121     my $short_name = $table->name;
15122     my $standard_short_name = standardize($short_name);
15123     my $full_name = $table->full_name;
15124     my $standard_full_name = standardize($full_name);
15125
15126     my $full_info = "";     # Text of info column for full-name entries
15127     my $other_info = "";    # Text of info column for short-name entries
15128     my $short_info = "";    # Text of info column for other entries
15129     my $meaning = "";       # Synonym of this table
15130
15131     my $property = ($table->isa('Property'))
15132                    ? $table
15133                    : $table->parent->property;
15134
15135     my $perl_extension = $table->perl_extension;
15136
15137     # Get the more official name for for perl extensions that aren't
15138     # stand-alone properties
15139     if ($perl_extension && $property != $table) {
15140         if ($property == $perl ||$property->type == $BINARY) {
15141             $meaning = $table->complete_name;
15142         }
15143         else {
15144             $meaning = $property->full_name . "=$full_name";
15145         }
15146     }
15147
15148     # There are three types of info column.  One for the short name, one for
15149     # the full name, and one for everything else.  They mostly are the same,
15150     # so initialize in the same loop.
15151     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
15152         if ($perl_extension && $property != $table) {
15153
15154             # Add the synonymous name for the non-full name entries; and to
15155             # the full-name entry if it adds extra information
15156             if ($info_ref == \$other_info
15157                 || ($info_ref == \$short_info
15158                     && $standard_short_name ne $standard_full_name)
15159                 || standardize($meaning) ne $standard_full_name
15160             ) {
15161                 $$info_ref .= "$meaning.";
15162             }
15163         }
15164         elsif ($info_ref != \$full_info) {
15165
15166             # Otherwise, the non-full name columns include the full name
15167             $$info_ref .= $full_name;
15168         }
15169
15170         # And the full-name entry includes the short name, if different
15171         if ($info_ref == \$full_info
15172             && $standard_short_name ne $standard_full_name)
15173         {
15174             $full_info =~ s/\.\Z//;
15175             $full_info .= "  " if $full_info;
15176             $full_info .= "(Short: $short_name)";
15177         }
15178
15179         if ($table->perl_extension) {
15180             $$info_ref =~ s/\.\Z//;
15181             $$info_ref .= ".  " if $$info_ref;
15182             $$info_ref .= "(Perl extension)";
15183         }
15184     }
15185
15186     # Add any extra annotations to the full name entry
15187     foreach my $more_info ($table->description,
15188                             $table->note,
15189                             $table->status_info)
15190     {
15191         next unless $more_info;
15192         $full_info =~ s/\.\Z//;
15193         $full_info .= ".  " if $full_info;
15194         $full_info .= $more_info;
15195     }
15196
15197     # These keep track if have created full and short name pod entries for the
15198     # property
15199     my $done_full = 0;
15200     my $done_short = 0;
15201
15202     # Every possible name is kept track of, even those that aren't going to be
15203     # output.  This way we can be sure to find the ambiguities.
15204     foreach my $alias ($table->aliases) {
15205         my $name = $alias->name;
15206         my $standard = standardize($name);
15207         my $info;
15208         my $output_this = $alias->ucd;
15209
15210         # If the full and short names are the same, we want to output the full
15211         # one's entry, so it has priority.
15212         if ($standard eq $standard_full_name) {
15213             next if $done_full;
15214             $done_full = 1;
15215             $info = $full_info;
15216         }
15217         elsif ($standard eq $standard_short_name) {
15218             next if $done_short;
15219             $done_short = 1;
15220             next if $standard_short_name eq $standard_full_name;
15221             $info = $short_info;
15222         }
15223         else {
15224             $info = $other_info;
15225         }
15226
15227         # Here, we have set up the two columns for this entry.  But if an
15228         # entry already exists for this name, we have to decide which one
15229         # we're going to later output.
15230         if (exists $ucd_pod{$standard}) {
15231
15232             # If the two entries refer to the same property, it's not going to
15233             # be ambiguous.  (Likely it's because the names when standardized
15234             # are the same.)  But that means if they are different properties,
15235             # there is ambiguity.
15236             if ($ucd_pod{$standard}->{'property'} != $property) {
15237
15238                 # Here, we have an ambiguity.  This code assumes that one is
15239                 # scheduled to be output and one not and that one is a perl
15240                 # extension (which is not to be output) and the other isn't.
15241                 # If those assumptions are wrong, things have to be rethought.
15242                 if ($ucd_pod{$standard}{'output_this'} == $output_this
15243                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
15244                     || $output_this == $perl_extension)
15245                 {
15246                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
15247                 }
15248
15249                 # We modifiy the info column of the one being output to
15250                 # indicate the ambiguity.  Set $which to point to that one's
15251                 # info.
15252                 my $which;
15253                 if ($ucd_pod{$standard}{'output_this'}) {
15254                     $which = \$ucd_pod{$standard}->{'info'};
15255                 }
15256                 else {
15257                     $which = \$info;
15258                     $meaning = $ucd_pod{$standard}{'meaning'};
15259                 }
15260
15261                 chomp $$which;
15262                 $$which =~ s/\.\Z//;
15263                 $$which .= "; NOT '$standard' meaning '$meaning'";
15264
15265                 $ambiguous_names{$standard} = 1;
15266             }
15267
15268             # Use the non-perl-extension variant
15269             next unless $ucd_pod{$standard}{'perl_extension'};
15270         }
15271
15272         # Store enough information about this entry that we can later look for
15273         # ambiguities, and output it properly.
15274         $ucd_pod{$standard} = { 'name' => $name,
15275                                 'info' => $info,
15276                                 'meaning' => $meaning,
15277                                 'output_this' => $output_this,
15278                                 'perl_extension' => $perl_extension,
15279                                 'property' => $property,
15280                                 'status' => $alias->status,
15281         };
15282     } # End of looping through all this table's aliases
15283
15284     return;
15285 }
15286
15287 sub pod_alphanumeric_sort {
15288     # Sort pod entries alphanumerically.
15289
15290     # The first few character columns are filler, plus the '\p{'; and get rid
15291     # of all the trailing stuff, starting with the trailing '}', so as to sort
15292     # on just 'Name=Value'
15293     (my $a = lc $a) =~ s/^ .*? { //x;
15294     $a =~ s/}.*//;
15295     (my $b = lc $b) =~ s/^ .*? { //x;
15296     $b =~ s/}.*//;
15297
15298     # Determine if the two operands are both internal only or both not.
15299     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
15300     # should be the underscore that begins internal only
15301     my $a_is_internal = (substr($a, 0, 1) eq '_');
15302     my $b_is_internal = (substr($b, 0, 1) eq '_');
15303
15304     # Sort so the internals come last in the table instead of first (which the
15305     # leading underscore would otherwise indicate).
15306     if ($a_is_internal != $b_is_internal) {
15307         return 1 if $a_is_internal;
15308         return -1
15309     }
15310
15311     # Determine if the two operands are numeric property values or not.
15312     # A numeric property will look like xyz: 3.  But the number
15313     # can begin with an optional minus sign, and may have a
15314     # fraction or rational component, like xyz: 3/2.  If either
15315     # isn't numeric, use alphabetic sort.
15316     my ($a_initial, $a_number) =
15317         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15318     return $a cmp $b unless defined $a_number;
15319     my ($b_initial, $b_number) =
15320         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15321     return $a cmp $b unless defined $b_number;
15322
15323     # Here they are both numeric, but use alphabetic sort if the
15324     # initial parts don't match
15325     return $a cmp $b if $a_initial ne $b_initial;
15326
15327     # Convert rationals to floating for the comparison.
15328     $a_number = eval $a_number if $a_number =~ qr{/};
15329     $b_number = eval $b_number if $b_number =~ qr{/};
15330
15331     return $a_number <=> $b_number;
15332 }
15333
15334 sub make_pod () {
15335     # Create the .pod file.  This generates the various subsections and then
15336     # combines them in one big HERE document.
15337
15338     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15339
15340     return unless defined $pod_directory;
15341     print "Making pod file\n" if $verbosity >= $PROGRESS;
15342
15343     my $exception_message =
15344     '(Any exceptions are individually noted beginning with the word NOT.)';
15345     my @block_warning;
15346     if (-e 'Blocks.txt') {
15347
15348         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
15349         # if the global $has_In_conflicts indicates we have them.
15350         push @match_properties, format_pod_line($indent_info_column,
15351                                                 '\p{In_*}',
15352                                                 '\p{Block: *}'
15353                                                     . (($has_In_conflicts)
15354                                                       ? " $exception_message"
15355                                                       : ""));
15356         @block_warning = << "END";
15357
15358 Matches in the Block property have shortcuts that begin with "In_".  For
15359 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
15360 backward compatibility, if there is no conflict with another shortcut, these
15361 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
15362 are numerous such conflicting shortcuts.  Use of these forms for Block is
15363 discouraged, and are flagged as such, not only because of the potential
15364 confusion as to what is meant, but also because a later release of Unicode may
15365 preempt the shortcut, and your program would no longer be correct.  Use the
15366 "In_" form instead to avoid this, or even more clearly, use the compound form,
15367 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
15368 about this.
15369 END
15370     }
15371     my $text = $Is_flags_text;
15372     $text = "$exception_message $text" if $has_Is_conflicts;
15373
15374     # And the 'Is_ line';
15375     push @match_properties, format_pod_line($indent_info_column,
15376                                             '\p{Is_*}',
15377                                             "\\p{*} $text");
15378
15379     # Sort the properties array for output.  It is sorted alphabetically
15380     # except numerically for numeric properties, and only output unique lines.
15381     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15382
15383     my $formatted_properties = simple_fold(\@match_properties,
15384                                         "",
15385                                         # indent succeeding lines by two extra
15386                                         # which looks better
15387                                         $indent_info_column + 2,
15388
15389                                         # shorten the line length by how much
15390                                         # the formatter indents, so the folded
15391                                         # line will fit in the space
15392                                         # presumably available
15393                                         $automatic_pod_indent);
15394     # Add column headings, indented to be a little more centered, but not
15395     # exactly
15396     $formatted_properties =  format_pod_line($indent_info_column,
15397                                                     '    NAME',
15398                                                     '           INFO')
15399                                     . "\n"
15400                                     . $formatted_properties;
15401
15402     # Generate pod documentation lines for the tables that match nothing
15403     my $zero_matches = "";
15404     if (@zero_match_tables) {
15405         @zero_match_tables = uniques(@zero_match_tables);
15406         $zero_matches = join "\n\n",
15407                         map { $_ = '=item \p{' . $_->complete_name . "}" }
15408                             sort { $a->complete_name cmp $b->complete_name }
15409                             @zero_match_tables;
15410
15411         $zero_matches = <<END;
15412
15413 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15414
15415 Unicode has some property-value pairs that currently don't match anything.
15416 This happens generally either because they are obsolete, or they exist for
15417 symmetry with other forms, but no language has yet been encoded that uses
15418 them.  In this version of Unicode, the following match zero code points:
15419
15420 =over 4
15421
15422 $zero_matches
15423
15424 =back
15425
15426 END
15427     }
15428
15429     # Generate list of properties that we don't accept, grouped by the reasons
15430     # why.  This is so only put out the 'why' once, and then list all the
15431     # properties that have that reason under it.
15432
15433     my %why_list;   # The keys are the reasons; the values are lists of
15434                     # properties that have the key as their reason
15435
15436     # For each property, add it to the list that are suppressed for its reason
15437     # The sort will cause the alphabetically first properties to be added to
15438     # each list first, so each list will be sorted.
15439     foreach my $property (sort keys %why_suppressed) {
15440         push @{$why_list{$why_suppressed{$property}}}, $property;
15441     }
15442
15443     # For each reason (sorted by the first property that has that reason)...
15444     my @bad_re_properties;
15445     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15446                      keys %why_list)
15447     {
15448         # Add to the output, all the properties that have that reason.
15449         my $has_item = 0;   # Flag if actually output anything.
15450         foreach my $name (@{$why_list{$why}}) {
15451
15452             # Split compound names into $property and $table components
15453             my $property = $name;
15454             my $table;
15455             if ($property =~ / (.*) = (.*) /x) {
15456                 $property = $1;
15457                 $table = $2;
15458             }
15459
15460             # This release of Unicode may not have a property that is
15461             # suppressed, so don't reference a non-existent one.
15462             $property = property_ref($property);
15463             next if ! defined $property;
15464
15465             # And since this list is only for match tables, don't list the
15466             # ones that don't have match tables.
15467             next if ! $property->to_create_match_tables;
15468
15469             # Find any abbreviation, and turn it into a compound name if this
15470             # is a property=value pair.
15471             my $short_name = $property->name;
15472             $short_name .= '=' . $property->table($table)->name if $table;
15473
15474             # Start with an empty line.
15475             push @bad_re_properties, "\n\n" unless $has_item;
15476
15477             # And add the property as an item for the reason.
15478             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15479             $has_item = 1;
15480         }
15481
15482         # And add the reason under the list of properties, if such a list
15483         # actually got generated.  Note that the header got added
15484         # unconditionally before.  But pod ignores extra blank lines, so no
15485         # harm.
15486         push @bad_re_properties, "\n$why\n" if $has_item;
15487
15488     } # End of looping through each reason.
15489
15490     if (! @bad_re_properties) {
15491         push @bad_re_properties,
15492                 "*** This installation accepts ALL non-Unihan properties ***";
15493     }
15494     else {
15495         # Add =over only if non-empty to avoid an empty =over/=back section,
15496         # which is considered bad form.
15497         unshift @bad_re_properties, "\n=over 4\n";
15498         push @bad_re_properties, "\n=back\n";
15499     }
15500
15501     # Similiarly, generate a list of files that we don't use, grouped by the
15502     # reasons why.  First, create a hash whose keys are the reasons, and whose
15503     # values are anonymous arrays of all the files that share that reason.
15504     my %grouped_by_reason;
15505     foreach my $file (keys %ignored_files) {
15506         push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15507     }
15508     foreach my $file (keys %skipped_files) {
15509         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15510     }
15511
15512     # Then, sort each group.
15513     foreach my $group (keys %grouped_by_reason) {
15514         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15515                                         @{$grouped_by_reason{$group}} ;
15516     }
15517
15518     # Finally, create the output text.  For each reason (sorted by the
15519     # alphabetically first file that has that reason)...
15520     my @unused_files;
15521     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15522                                cmp lc $grouped_by_reason{$b}->[0]
15523                               }
15524                          keys %grouped_by_reason)
15525     {
15526         # Add all the files that have that reason to the output.  Start
15527         # with an empty line.
15528         push @unused_files, "\n\n";
15529         push @unused_files, map { "\n=item F<$_> \n" }
15530                             @{$grouped_by_reason{$reason}};
15531         # And add the reason under the list of files
15532         push @unused_files, "\n$reason\n";
15533     }
15534
15535     # Similarly, create the output text for the UCD section of the pod
15536     my @ucd_pod;
15537     foreach my $key (keys %ucd_pod) {
15538         next unless $ucd_pod{$key}->{'output_this'};
15539         push @ucd_pod, format_pod_line($indent_info_column,
15540                                        $ucd_pod{$key}->{'name'},
15541                                        $ucd_pod{$key}->{'info'},
15542                                        $ucd_pod{$key}->{'status'},
15543                                       );
15544     }
15545
15546     # Sort alphabetically, and fold for output
15547     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15548     my $ucd_pod = simple_fold(\@ucd_pod,
15549                            ' ',
15550                            $indent_info_column,
15551                            $automatic_pod_indent);
15552     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
15553                 . "\n"
15554                 . $ucd_pod;
15555     local $" = "";
15556
15557     # Everything is ready to assemble.
15558     my @OUT = << "END";
15559 =begin comment
15560
15561 $HEADER
15562
15563 To change this file, edit $0 instead.
15564
15565 =end comment
15566
15567 =head1 NAME
15568
15569 $pod_file - Index of Unicode Version $string_version character properties in Perl
15570
15571 =head1 DESCRIPTION
15572
15573 This document provides information about the portion of the Unicode database
15574 that deals with character properties, that is the portion that is defined on
15575 single code points.  (L</Other information in the Unicode data base>
15576 below briefly mentions other data that Unicode provides.)
15577
15578 Perl can provide access to all non-provisional Unicode character properties,
15579 though not all are enabled by default.  The omitted ones are the Unihan
15580 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15581 deprecated or Unicode-internal properties.  (An installation may choose to
15582 recompile Perl's tables to change this.  See L<Unicode character
15583 properties that are NOT accepted by Perl>.)
15584
15585 For most purposes, access to Unicode properties from the Perl core is through
15586 regular expression matches, as described in the next section.
15587 For some special purposes, and to access the properties that are not suitable
15588 for regular expression matching, all the Unicode character properties that
15589 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15590 described in the section L</Properties accessible through Unicode::UCD>.
15591
15592 Perl also provides some additional extensions and short-cut synonyms
15593 for Unicode properties.
15594
15595 This document merely lists all available properties and does not attempt to
15596 explain what each property really means.  There is a brief description of each
15597 Perl extension; see L<perlunicode/Other Properties> for more information on
15598 these.  There is some detail about Blocks, Scripts, General_Category,
15599 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15600 official Unicode properties, refer to the Unicode standard.  A good starting
15601 place is L<$unicode_reference_url>.
15602
15603 Note that you can define your own properties; see
15604 L<perlunicode/"User-Defined Character Properties">.
15605
15606 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15607
15608 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15609 most of the Unicode character properties.  The table below shows all these
15610 constructs, both single and compound forms.
15611
15612 B<Compound forms> consist of two components, separated by an equals sign or a
15613 colon.  The first component is the property name, and the second component is
15614 the particular value of the property to match against, for example,
15615 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15616 whose Script property value is Greek.
15617
15618 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15619 their equivalent compound forms.  The table shows these equivalences.  (In our
15620 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
15621 There are also a few Perl-defined single forms that are not shortcuts for a
15622 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
15623
15624 In parsing these constructs, Perl always ignores Upper/lower case differences
15625 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
15626 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
15627 the left brace completely changes the meaning of the construct, from "match"
15628 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
15629 for improved legibility.
15630
15631 Also, white space, hyphens, and underscores are normally ignored
15632 everywhere between the {braces}, and hence can be freely added or removed
15633 even if the C</x> modifier hasn't been specified on the regular expression.
15634 But in the table below $a_bold_stricter at the beginning of an entry
15635 means that tighter (stricter) rules are used for that entry:
15636
15637 =over 4
15638
15639 =over 4
15640
15641 =item Single form (C<\\p{name}>) tighter rules:
15642
15643 White space, hyphens, and underscores ARE significant
15644 except for:
15645
15646 =over 4
15647
15648 =item * white space adjacent to a non-word character
15649
15650 =item * underscores separating digits in numbers
15651
15652 =back
15653
15654 That means, for example, that you can freely add or remove white space
15655 adjacent to (but within) the braces without affecting the meaning.
15656
15657 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
15658
15659 The tighter rules given above for the single form apply to everything to the
15660 right of the colon or equals; the looser rules still apply to everything to
15661 the left.
15662
15663 That means, for example, that you can freely add or remove white space
15664 adjacent to (but within) the braces and the colon or equal sign.
15665
15666 =back
15667
15668 =back
15669
15670 Some properties are considered obsolete by Unicode, but still available.
15671 There are several varieties of obsolescence:
15672
15673 =over 4
15674
15675 =over 4
15676
15677 =item Stabilized
15678
15679 A property may be stabilized.  Such a determination does not indicate
15680 that the property should or should not be used; instead it is a declaration
15681 that the property will not be maintained nor extended for newly encoded
15682 characters.  Such properties are marked with $a_bold_stabilized in the
15683 table.
15684
15685 =item Deprecated
15686
15687 A property may be deprecated, perhaps because its original intent
15688 has been replaced by another property, or because its specification was
15689 somehow defective.  This means that its use is strongly
15690 discouraged, so much so that a warning will be issued if used, unless the
15691 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
15692 statement.  $A_bold_deprecated flags each such entry in the table, and
15693 the entry there for the longest, most descriptive version of the property will
15694 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
15695 warning, even for properties that aren't officially deprecated by Unicode,
15696 when there used to be characters or code points that were matched by them, but
15697 no longer.  This is to warn you that your program may not work like it did on
15698 earlier Unicode releases.
15699
15700 A deprecated property may be made unavailable in a future Perl version, so it
15701 is best to move away from them.
15702
15703 A deprecated property may also be stabilized, but this fact is not shown.
15704
15705 =item Obsolete
15706
15707 Properties marked with $a_bold_obsolete in the table are considered (plain)
15708 obsolete.  Generally this designation is given to properties that Unicode once
15709 used for internal purposes (but not any longer).
15710
15711 =back
15712
15713 Some Perl extensions are present for backwards compatibility and are
15714 discouraged from being used, but are not obsolete.  $A_bold_discouraged
15715 flags each such entry in the table.  Future Unicode versions may force
15716 some of these extensions to be removed without warning, replaced by another
15717 property with the same name that means something different.  Use the
15718 equivalent shown instead.
15719
15720 =back
15721
15722 @block_warning
15723
15724 The table below has two columns.  The left column contains the C<\\p{}>
15725 constructs to look up, possibly preceded by the flags mentioned above; and
15726 the right column contains information about them, like a description, or
15727 synonyms.  The table shows both the single and compound forms for each
15728 property that has them.  If the left column is a short name for a property,
15729 the right column will give its longer, more descriptive name; and if the left
15730 column is the longest name, the right column will show any equivalent shortest
15731 name, in both single and compound forms if applicable.
15732
15733 The right column will also caution you if a property means something different
15734 than what might normally be expected.
15735
15736 All single forms are Perl extensions; a few compound forms are as well, and
15737 are noted as such.
15738
15739 Numbers in (parentheses) indicate the total number of Unicode code points
15740 matched by the property.  For emphasis, those properties that match no code
15741 points at all are listed as well in a separate section following the table.
15742
15743 Most properties match the same code points regardless of whether C<"/i">
15744 case-insensitive matching is specified or not.  But a few properties are
15745 affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
15746 in the second column.  Under case-insensitive matching they match the
15747 same code pode points as the property I<other_property>.
15748
15749 There is no description given for most non-Perl defined properties (See
15750 L<$unicode_reference_url> for that).
15751
15752 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
15753 combinations.  For example, entries like:
15754
15755  \\p{Gc: *}                                  \\p{General_Category: *}
15756
15757 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
15758 for the latter is also valid for the former.  Similarly,
15759
15760  \\p{Is_*}                                   \\p{*}
15761
15762 means that if and only if, for example, C<\\p{Foo}> exists, then
15763 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
15764 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
15765 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
15766 underscore.
15767
15768 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
15769 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
15770 'N*' to indicate this, and doesn't have separate entries for the other
15771 possibilities.  Note that not all properties which have values 'Yes' and 'No'
15772 are binary, and they have all their values spelled out without using this wild
15773 card, and a C<NOT> clause in their description that highlights their not being
15774 binary.  These also require the compound form to match them, whereas true
15775 binary properties have both single and compound forms available.
15776
15777 Note that all non-essential underscores are removed in the display of the
15778 short names below.
15779
15780 B<Legend summary:>
15781
15782 =over 4
15783
15784 =item *
15785
15786 B<*> is a wild-card
15787
15788 =item *
15789
15790 B<(\\d+)> in the info column gives the number of Unicode code points matched
15791 by this property.
15792
15793 =item *
15794
15795 B<$DEPRECATED> means this is deprecated.
15796
15797 =item *
15798
15799 B<$OBSOLETE> means this is obsolete.
15800
15801 =item *
15802
15803 B<$STABILIZED> means this is stabilized.
15804
15805 =item *
15806
15807 B<$STRICTER> means tighter (stricter) name matching applies.
15808
15809 =item *
15810
15811 B<$DISCOURAGED> means use of this form is discouraged, and may not be
15812 stable.
15813
15814 =back
15815
15816 $formatted_properties
15817
15818 $zero_matches
15819
15820 =head1 Properties accessible through Unicode::UCD
15821
15822 All the Unicode character properties mentioned above (except for those marked
15823 as for internal use by Perl) are also accessible by
15824 L<Unicode::UCD/prop_invlist()>.
15825
15826 Due to their nature, not all Unicode character properties are suitable for
15827 regular expression matches, nor C<prop_invlist()>.  The remaining
15828 non-provisional, non-internal ones are accessible via
15829 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
15830 hasn't included; see L<below for which those are|/Unicode character properties
15831 that are NOT accepted by Perl>).
15832
15833 For compatibility with other parts of Perl, all the single forms given in the
15834 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
15835 are recognized.  BUT, there are some ambiguities between some Perl extensions
15836 and the Unicode properties, all of which are silently resolved in favor of the
15837 official Unicode property.  To avoid surprises, you should only use
15838 C<prop_invmap()> for forms listed in the table below, which omits the
15839 non-recommended ones.  The affected forms are the Perl single form equivalents
15840 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
15841 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
15842 whose short name is C<sc>.  The table indicates the current ambiguities in the
15843 INFO column, beginning with the word C<"NOT">.
15844
15845 The standard Unicode properties listed below are documented in
15846 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
15847 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
15848 L<perlunicode/Other Properties>;
15849
15850 The first column in the table is a name for the property; the second column is
15851 an alternative name, if any, plus possibly some annotations.  The alternative
15852 name is the property's full name, unless that would simply repeat the first
15853 column, in which case the second column indicates the property's short name
15854 (if different).  The annotations are given only in the entry for the full
15855 name.  If a property is obsolete, etc, the entry will be flagged with the same
15856 characters used in the table in the L<section above|/Properties accessible
15857 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
15858
15859 $ucd_pod
15860
15861 =head1 Properties accessible through other means
15862
15863 Certain properties are accessible also via core function calls.  These are:
15864
15865  Lowercase_Mapping          lc() and lcfirst()
15866  Titlecase_Mapping          ucfirst()
15867  Uppercase_Mapping          uc()
15868
15869 Also, Case_Folding is accessible through the C</i> modifier in regular
15870 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
15871 operator.
15872
15873 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
15874 interpolation in double-quoted strings and regular expressions; and functions
15875 C<charnames::viacode()>, C<charnames::vianame()>, and
15876 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
15877 specified.
15878
15879 Finally, most properties related to decomposition are accessible via
15880 L<Unicode::Normalize>.
15881
15882 =head1 Unicode character properties that are NOT accepted by Perl
15883
15884 Perl will generate an error for a few character properties in Unicode when
15885 used in a regular expression.  The non-Unihan ones are listed below, with the
15886 reasons they are not accepted, perhaps with work-arounds.  The short names for
15887 the properties are listed enclosed in (parentheses).
15888 As described after the list, an installation can change the defaults and choose
15889 to accept any of these.  The list is machine generated based on the
15890 choices made for the installation that generated this document.
15891
15892 @bad_re_properties
15893
15894 An installation can choose to allow any of these to be matched by downloading
15895 the Unicode database from L<http://www.unicode.org/Public/> to
15896 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
15897 controlling lists contained in the program
15898 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
15899 (C<\%Config> is available from the Config module).
15900
15901 =head1 Other information in the Unicode data base
15902
15903 The Unicode data base is delivered in two different formats.  The XML version
15904 is valid for more modern Unicode releases.  The other version is a collection
15905 of files.  The two are intended to give equivalent information.  Perl uses the
15906 older form; this allows you to recompile Perl to use early Unicode releases.
15907
15908 The only non-character property that Perl currently supports is Named
15909 Sequences, in which a sequence of code points
15910 is given a name and generally treated as a single entity.  (Perl supports
15911 these via the C<\\N{...}> double-quotish construct,
15912 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
15913
15914 Below is a list of the files in the Unicode data base that Perl doesn't
15915 currently use, along with very brief descriptions of their purposes.
15916 Some of the names of the files have been shortened from those that Unicode
15917 uses, in order to allow them to be distinguishable from similarly named files
15918 on file systems for which only the first 8 characters of a name are
15919 significant.
15920
15921 =over 4
15922
15923 @unused_files
15924
15925 =back
15926
15927 =head1 SEE ALSO
15928
15929 L<$unicode_reference_url>
15930
15931 L<perlrecharclass>
15932
15933 L<perlunicode>
15934
15935 END
15936
15937     # And write it.  The 0 means no utf8.
15938     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
15939     return;
15940 }
15941
15942 sub make_Heavy () {
15943     # Create and write Heavy.pl, which passes info about the tables to
15944     # utf8_heavy.pl
15945
15946     # Stringify structures for output
15947     my $loose_property_name_of
15948                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
15949     chomp $loose_property_name_of;
15950
15951     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
15952     chomp $stricter_to_file_of;
15953
15954     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
15955     chomp $loose_to_file_of;
15956
15957     my $nv_floating_to_rational
15958                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
15959     chomp $nv_floating_to_rational;
15960
15961     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
15962     chomp $why_deprecated;
15963
15964     # We set the key to the file when we associated files with tables, but we
15965     # couldn't do the same for the value then, as we might not have the file
15966     # for the alternate table figured out at that time.
15967     foreach my $cased (keys %caseless_equivalent_to) {
15968         my @path = $caseless_equivalent_to{$cased}->file_path;
15969         my $path = join '/', @path[1, -1];
15970         $caseless_equivalent_to{$cased} = $path;
15971     }
15972     my $caseless_equivalent_to
15973                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
15974     chomp $caseless_equivalent_to;
15975
15976     my $loose_property_to_file_of
15977                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
15978     chomp $loose_property_to_file_of;
15979
15980     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
15981     chomp $file_to_swash_name;
15982
15983     my @heavy = <<END;
15984 $HEADER
15985 $INTERNAL_ONLY_HEADER
15986
15987 # This file is for the use of utf8_heavy.pl and Unicode::UCD
15988
15989 # Maps Unicode (not Perl single-form extensions) property names in loose
15990 # standard form to their corresponding standard names
15991 \%utf8::loose_property_name_of = (
15992 $loose_property_name_of
15993 );
15994
15995 # Maps property, table to file for those using stricter matching
15996 \%utf8::stricter_to_file_of = (
15997 $stricter_to_file_of
15998 );
15999
16000 # Maps property, table to file for those using loose matching
16001 \%utf8::loose_to_file_of = (
16002 $loose_to_file_of
16003 );
16004
16005 # Maps floating point to fractional form
16006 \%utf8::nv_floating_to_rational = (
16007 $nv_floating_to_rational
16008 );
16009
16010 # If a floating point number doesn't have enough digits in it to get this
16011 # close to a fraction, it isn't considered to be that fraction even if all the
16012 # digits it does have match.
16013 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
16014
16015 # Deprecated tables to generate a warning for.  The key is the file containing
16016 # the table, so as to avoid duplication, as many property names can map to the
16017 # file, but we only need one entry for all of them.
16018 \%utf8::why_deprecated = (
16019 $why_deprecated
16020 );
16021
16022 # A few properties have different behavior under /i matching.  This maps
16023 # those to substitute files to use under /i.
16024 \%utf8::caseless_equivalent = (
16025 $caseless_equivalent_to
16026 );
16027
16028 # Property names to mapping files
16029 \%utf8::loose_property_to_file_of = (
16030 $loose_property_to_file_of
16031 );
16032
16033 # Files to the swash names within them.
16034 \%utf8::file_to_swash_name = (
16035 $file_to_swash_name
16036 );
16037
16038 1;
16039 END
16040
16041     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
16042     return;
16043 }
16044
16045 sub make_Name_pm () {
16046     # Create and write Name.pm, which contains subroutines and data to use in
16047     # conjunction with Name.pl
16048
16049     # Maybe there's nothing to do.
16050     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
16051
16052     my @name = <<END;
16053 $HEADER
16054 $INTERNAL_ONLY_HEADER
16055 END
16056
16057     # Convert these structures to output format.
16058     my $code_points_ending_in_code_point =
16059         main::simple_dumper(\@code_points_ending_in_code_point,
16060                             ' ' x 8);
16061     my $names = main::simple_dumper(\%names_ending_in_code_point,
16062                                     ' ' x 8);
16063     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
16064                                     ' ' x 8);
16065
16066     # Do the same with the Hangul names,
16067     my $jamo;
16068     my $jamo_l;
16069     my $jamo_v;
16070     my $jamo_t;
16071     my $jamo_re;
16072     if ($has_hangul_syllables) {
16073
16074         # Construct a regular expression of all the possible
16075         # combinations of the Hangul syllables.
16076         my @L_re;   # Leading consonants
16077         for my $i ($LBase .. $LBase + $LCount - 1) {
16078             push @L_re, $Jamo{$i}
16079         }
16080         my @V_re;   # Middle vowels
16081         for my $i ($VBase .. $VBase + $VCount - 1) {
16082             push @V_re, $Jamo{$i}
16083         }
16084         my @T_re;   # Trailing consonants
16085         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
16086             push @T_re, $Jamo{$i}
16087         }
16088
16089         # The whole re is made up of the L V T combination.
16090         $jamo_re = '('
16091                     . join ('|', sort @L_re)
16092                     . ')('
16093                     . join ('|', sort @V_re)
16094                     . ')('
16095                     . join ('|', sort @T_re)
16096                     . ')?';
16097
16098         # These hashes needed by the algorithm were generated
16099         # during reading of the Jamo.txt file
16100         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
16101         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
16102         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
16103         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
16104     }
16105
16106     push @name, <<END;
16107
16108 package charnames;
16109
16110 # This module contains machine-generated tables and code for the
16111 # algorithmically-determinable Unicode character names.  The following
16112 # routines can be used to translate between name and code point and vice versa
16113
16114 { # Closure
16115
16116     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
16117     # two must be 10; if there are 5, the first must not be a 0.  Written this
16118     # way to decrease backtracking.  The first regex allows the code point to
16119     # be at the end of a word, but to work properly, the word shouldn't end
16120     # with a valid hex character.  The second one won't match a code point at
16121     # the end of a word, and doesn't have the run-on issue
16122     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
16123     my \$code_point_re = qr/$code_point_re/;
16124
16125     # In the following hash, the keys are the bases of names which include
16126     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
16127     # of each key is another hash which is used to get the low and high ends
16128     # for each range of code points that apply to the name.
16129     my %names_ending_in_code_point = (
16130 $names
16131     );
16132
16133     # The following hash is a copy of the previous one, except is for loose
16134     # matching, so each name has blanks and dashes squeezed out
16135     my %loose_names_ending_in_code_point = (
16136 $loose_names
16137     );
16138
16139     # And the following array gives the inverse mapping from code points to
16140     # names.  Lowest code points are first
16141     my \@code_points_ending_in_code_point = (
16142 $code_points_ending_in_code_point
16143     );
16144 END
16145     # Earlier releases didn't have Jamos.  No sense outputting
16146     # them unless will be used.
16147     if ($has_hangul_syllables) {
16148         push @name, <<END;
16149
16150     # Convert from code point to Jamo short name for use in composing Hangul
16151     # syllable names
16152     my %Jamo = (
16153 $jamo
16154     );
16155
16156     # Leading consonant (can be null)
16157     my %Jamo_L = (
16158 $jamo_l
16159     );
16160
16161     # Vowel
16162     my %Jamo_V = (
16163 $jamo_v
16164     );
16165
16166     # Optional trailing consonant
16167     my %Jamo_T = (
16168 $jamo_t
16169     );
16170
16171     # Computed re that splits up a Hangul name into LVT or LV syllables
16172     my \$syllable_re = qr/$jamo_re/;
16173
16174     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
16175     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
16176
16177     # These constants names and values were taken from the Unicode standard,
16178     # version 5.1, section 3.12.  They are used in conjunction with Hangul
16179     # syllables
16180     my \$SBase = $SBase_string;
16181     my \$LBase = $LBase_string;
16182     my \$VBase = $VBase_string;
16183     my \$TBase = $TBase_string;
16184     my \$SCount = $SCount;
16185     my \$LCount = $LCount;
16186     my \$VCount = $VCount;
16187     my \$TCount = $TCount;
16188     my \$NCount = \$VCount * \$TCount;
16189 END
16190     } # End of has Jamos
16191
16192     push @name, << 'END';
16193
16194     sub name_to_code_point_special {
16195         my ($name, $loose) = @_;
16196
16197         # Returns undef if not one of the specially handled names; otherwise
16198         # returns the code point equivalent to the input name
16199         # $loose is non-zero if to use loose matching, 'name' in that case
16200         # must be input as upper case with all blanks and dashes squeezed out.
16201 END
16202     if ($has_hangul_syllables) {
16203         push @name, << 'END';
16204
16205         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
16206             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
16207         {
16208             return if $name !~ qr/^$syllable_re$/;
16209             my $L = $Jamo_L{$1};
16210             my $V = $Jamo_V{$2};
16211             my $T = (defined $3) ? $Jamo_T{$3} : 0;
16212             return ($L * $VCount + $V) * $TCount + $T + $SBase;
16213         }
16214 END
16215     }
16216     push @name, << 'END';
16217
16218         # Name must end in 'code_point' for this to handle.
16219         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
16220                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
16221
16222         my $base = $1;
16223         my $code_point = CORE::hex $2;
16224         my $names_ref;
16225
16226         if ($loose) {
16227             $names_ref = \%loose_names_ending_in_code_point;
16228         }
16229         else {
16230             return if $base !~ s/-$//;
16231             $names_ref = \%names_ending_in_code_point;
16232         }
16233
16234         # Name must be one of the ones which has the code point in it.
16235         return if ! $names_ref->{$base};
16236
16237         # Look through the list of ranges that apply to this name to see if
16238         # the code point is in one of them.
16239         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
16240             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
16241             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
16242
16243             # Here, the code point is in the range.
16244             return $code_point;
16245         }
16246
16247         # Here, looked like the name had a code point number in it, but
16248         # did not match one of the valid ones.
16249         return;
16250     }
16251
16252     sub code_point_to_name_special {
16253         my $code_point = shift;
16254
16255         # Returns the name of a code point if algorithmically determinable;
16256         # undef if not
16257 END
16258     if ($has_hangul_syllables) {
16259         push @name, << 'END';
16260
16261         # If in the Hangul range, calculate the name based on Unicode's
16262         # algorithm
16263         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
16264             use integer;
16265             my $SIndex = $code_point - $SBase;
16266             my $L = $LBase + $SIndex / $NCount;
16267             my $V = $VBase + ($SIndex % $NCount) / $TCount;
16268             my $T = $TBase + $SIndex % $TCount;
16269             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
16270             $name .= $Jamo{$T} if $T != $TBase;
16271             return $name;
16272         }
16273 END
16274     }
16275     push @name, << 'END';
16276
16277         # Look through list of these code points for one in range.
16278         foreach my $hash (@code_points_ending_in_code_point) {
16279             return if $code_point < $hash->{'low'};
16280             if ($code_point <= $hash->{'high'}) {
16281                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
16282             }
16283         }
16284         return;            # None found
16285     }
16286 } # End closure
16287
16288 1;
16289 END
16290
16291     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
16292     return;
16293 }
16294
16295 sub make_UCD () {
16296     # Create and write UCD.pl, which passes info about the tables to
16297     # Unicode::UCD
16298
16299     # Create a mapping from each alias of Perl single-form extensions to all
16300     # its equivalent aliases, for quick look-up.
16301     my %perlprop_to_aliases;
16302     foreach my $table ($perl->tables) {
16303
16304         # First create the list of the aliases of each extension
16305         my @aliases_list;    # List of legal aliases for this extension
16306
16307         my $table_name = $table->name;
16308         my $standard_table_name = standardize($table_name);
16309         my $table_full_name = $table->full_name;
16310         my $standard_table_full_name = standardize($table_full_name);
16311
16312         # Make sure that the list has both the short and full names
16313         push @aliases_list, $table_name, $table_full_name;
16314
16315         my $found_ucd = 0;  # ? Did we actually get an alias that should be
16316                             # output for this table
16317
16318         # Go through all the aliases (including the two just added), and add
16319         # any new unique ones to the list
16320         foreach my $alias ($table->aliases) {
16321
16322             # Skip non-legal names
16323             next unless $alias->ok_as_filename;
16324             next unless $alias->ucd;
16325
16326             $found_ucd = 1;     # have at least one legal name
16327
16328             my $name = $alias->name;
16329             my $standard = standardize($name);
16330
16331             # Don't repeat a name that is equivalent to one already on the
16332             # list
16333             next if $standard eq $standard_table_name;
16334             next if $standard eq $standard_table_full_name;
16335
16336             push @aliases_list, $name;
16337         }
16338
16339         # If there were no legal names, don't output anything.
16340         next unless $found_ucd;
16341
16342         # To conserve memory in the program reading these in, omit full names
16343         # that are identical to the short name, when those are the only two
16344         # aliases for the property.
16345         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16346             pop @aliases_list;
16347         }
16348
16349         # Here, @aliases_list is the list of all the aliases that this
16350         # extension legally has.  Now can create a map to it from each legal
16351         # standardized alias
16352         foreach my $alias ($table->aliases) {
16353             next unless $alias->ucd;
16354             next unless $alias->ok_as_filename;
16355             push @{$perlprop_to_aliases{standardize($alias->name)}},
16356                  @aliases_list;
16357         }
16358     }
16359
16360     # Make a list of all combinations of properties/values that are suppressed.
16361     my @suppressed;
16362     if (! $debug_skip) {    # This tends to fail in this debug mode
16363         foreach my $property_name (keys %why_suppressed) {
16364
16365             # Just the value
16366             my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16367
16368             # The hash may contain properties not in this release of Unicode
16369             next unless defined (my $property = property_ref($property_name));
16370
16371             # Find all combinations
16372             foreach my $prop_alias ($property->aliases) {
16373                 my $prop_alias_name = standardize($prop_alias->name);
16374
16375                 # If no =value, there's just one combination possibe for this
16376                 if (! $value_name) {
16377
16378                     # The property may be suppressed, but there may be a proxy
16379                     # for it, so it shouldn't be listed as suppressed
16380                     next if $prop_alias->ucd;
16381                     push @suppressed, $prop_alias_name;
16382                 }
16383                 else {  # Otherwise
16384                     foreach my $value_alias
16385                                     ($property->table($value_name)->aliases)
16386                     {
16387                         next if $value_alias->ucd;
16388
16389                         push @suppressed, "$prop_alias_name="
16390                                         .  standardize($value_alias->name);
16391                     }
16392                 }
16393             }
16394         }
16395     }
16396     @suppressed = sort @suppressed; # So doesn't change between runs of this
16397                                     # program
16398
16399     # Convert the structure below (designed for Name.pm) to a form that UCD
16400     # wants, so it doesn't have to modify it at all; i.e. so that it includes
16401     # an element for the Hangul syllables in the appropriate place, and
16402     # otherwise changes the name to include the "-<code point>" suffix.
16403     my @algorithm_names;
16404     my $done_hangul = 0;
16405
16406     # Copy it linearly.
16407     for my $i (0 .. @code_points_ending_in_code_point - 1) {
16408
16409         # Insert the hanguls in the correct place.
16410         if (! $done_hangul
16411             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16412         {
16413             $done_hangul = 1;
16414             push @algorithm_names, { low => $SBase,
16415                                      high => $SBase + $SCount - 1,
16416                                      name => '<hangul syllable>',
16417                                     };
16418         }
16419
16420         # Copy the current entry, modified.
16421         push @algorithm_names, {
16422             low => $code_points_ending_in_code_point[$i]->{'low'},
16423             high => $code_points_ending_in_code_point[$i]->{'high'},
16424             name =>
16425                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16426         };
16427     }
16428
16429     # Serialize these structures for output.
16430     my $loose_to_standard_value
16431                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16432     chomp $loose_to_standard_value;
16433
16434     my $string_property_loose_to_name
16435                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16436     chomp $string_property_loose_to_name;
16437
16438     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16439     chomp $perlprop_to_aliases;
16440
16441     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16442     chomp $prop_aliases;
16443
16444     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16445     chomp $prop_value_aliases;
16446
16447     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16448     chomp $suppressed;
16449
16450     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16451     chomp $algorithm_names;
16452
16453     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16454     chomp $ambiguous_names;
16455
16456     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16457     chomp $loose_defaults;
16458
16459     my @ucd = <<END;
16460 $HEADER
16461 $INTERNAL_ONLY_HEADER
16462
16463 # This file is for the use of Unicode::UCD
16464
16465 # Highest legal Unicode code point
16466 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16467
16468 # Hangul syllables
16469 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16470 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16471
16472 # Keys are all the possible "prop=value" combinations, in loose form; values
16473 # are the standard loose name for the 'value' part of the key
16474 \%Unicode::UCD::loose_to_standard_value = (
16475 $loose_to_standard_value
16476 );
16477
16478 # String property loose names to standard loose name
16479 \%Unicode::UCD::string_property_loose_to_name = (
16480 $string_property_loose_to_name
16481 );
16482
16483 # Keys are Perl extensions in loose form; values are each one's list of
16484 # aliases
16485 \%Unicode::UCD::loose_perlprop_to_name = (
16486 $perlprop_to_aliases
16487 );
16488
16489 # Keys are standard property name; values are each one's aliases
16490 \%Unicode::UCD::prop_aliases = (
16491 $prop_aliases
16492 );
16493
16494 # Keys of top level are standard property name; values are keys to another
16495 # hash,  Each one is one of the property's values, in standard form.  The
16496 # values are that prop-val's aliases.  If only one specified, the short and
16497 # long alias are identical.
16498 \%Unicode::UCD::prop_value_aliases = (
16499 $prop_value_aliases
16500 );
16501
16502 # Ordered (by code point ordinal) list of the ranges of code points whose
16503 # names are algorithmically determined.  Each range entry is an anonymous hash
16504 # of the start and end points and a template for the names within it.
16505 \@Unicode::UCD::algorithmic_named_code_points = (
16506 $algorithm_names
16507 );
16508
16509 # The properties that as-is have two meanings, and which must be disambiguated
16510 \%Unicode::UCD::ambiguous_names = (
16511 $ambiguous_names
16512 );
16513
16514 # Keys are the prop-val combinations which are the default values for the
16515 # given property, expressed in standard loose form
16516 \%Unicode::UCD::loose_defaults = (
16517 $loose_defaults
16518 );
16519
16520 # All combinations of names that are suppressed.
16521 # This is actually for UCD.t, so it knows which properties shouldn't have
16522 # entries.  If it got any bigger, would probably want to put it in its own
16523 # file to use memory only when it was needed, in testing.
16524 \@Unicode::UCD::suppressed_properties = (
16525 $suppressed
16526 );
16527
16528 1;
16529 END
16530
16531     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
16532     return;
16533 }
16534
16535 sub write_all_tables() {
16536     # Write out all the tables generated by this program to files, as well as
16537     # the supporting data structures, pod file, and .t file.
16538
16539     my @writables;              # List of tables that actually get written
16540     my %match_tables_to_write;  # Used to collapse identical match tables
16541                                 # into one file.  Each key is a hash function
16542                                 # result to partition tables into buckets.
16543                                 # Each value is an array of the tables that
16544                                 # fit in the bucket.
16545
16546     # For each property ...
16547     # (sort so that if there is an immutable file name, it has precedence, so
16548     # some other property can't come in and take over its file name.  (We
16549     # don't care if both defined, as they had better be different anyway.)
16550     # The property named 'Perl' needs to be first (it doesn't have any
16551     # immutable file name) because empty properties are defined in terms of
16552     # it's table named 'Any'.)   We also sort by the property's name.  This is
16553     # just for repeatability of the outputs between runs of this program, but
16554     # does not affect correctness.
16555     PROPERTY:
16556     foreach my $property ($perl,
16557                           sort { return -1 if defined $a->file;
16558                                  return 1 if defined $b->file;
16559                                  return $a->name cmp $b->name;
16560                                 } grep { $_ != $perl } property_ref('*'))
16561     {
16562         my $type = $property->type;
16563
16564         # And for each table for that property, starting with the mapping
16565         # table for it ...
16566         TABLE:
16567         foreach my $table($property,
16568
16569                         # and all the match tables for it (if any), sorted so
16570                         # the ones with the shortest associated file name come
16571                         # first.  The length sorting prevents problems of a
16572                         # longer file taking a name that might have to be used
16573                         # by a shorter one.  The alphabetic sorting prevents
16574                         # differences between releases
16575                         sort {  my $ext_a = $a->external_name;
16576                                 return 1 if ! defined $ext_a;
16577                                 my $ext_b = $b->external_name;
16578                                 return -1 if ! defined $ext_b;
16579
16580                                 # But return the non-complement table before
16581                                 # the complement one, as the latter is defined
16582                                 # in terms of the former, and needs to have
16583                                 # the information for the former available.
16584                                 return 1 if $a->complement != 0;
16585                                 return -1 if $b->complement != 0;
16586
16587                                 # Similarly, return a subservient table after
16588                                 # a leader
16589                                 return 1 if $a->leader != $a;
16590                                 return -1 if $b->leader != $b;
16591
16592                                 my $cmp = length $ext_a <=> length $ext_b;
16593
16594                                 # Return result if lengths not equal
16595                                 return $cmp if $cmp;
16596
16597                                 # Alphabetic if lengths equal
16598                                 return $ext_a cmp $ext_b
16599                         } $property->tables
16600                     )
16601         {
16602
16603             # Here we have a table associated with a property.  It could be
16604             # the map table (done first for each property), or one of the
16605             # other tables.  Determine which type.
16606             my $is_property = $table->isa('Property');
16607
16608             my $name = $table->name;
16609             my $complete_name = $table->complete_name;
16610
16611             # See if should suppress the table if is empty, but warn if it
16612             # contains something.
16613             my $suppress_if_empty_warn_if_not
16614                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
16615
16616             # Calculate if this table should have any code points associated
16617             # with it or not.
16618             my $expected_empty =
16619
16620                 # $perl should be empty, as well as properties that we just
16621                 # don't do anything with
16622                 ($is_property
16623                     && ($table == $perl
16624                         || grep { $complete_name eq $_ }
16625                                                     @unimplemented_properties
16626                     )
16627                 )
16628
16629                 # Match tables in properties we skipped populating should be
16630                 # empty
16631                 || (! $is_property && ! $property->to_create_match_tables)
16632
16633                 # Tables and properties that are expected to have no code
16634                 # points should be empty
16635                 || $suppress_if_empty_warn_if_not
16636             ;
16637
16638             # Set a boolean if this table is the complement of an empty binary
16639             # table
16640             my $is_complement_of_empty_binary =
16641                 $type == $BINARY &&
16642                 (($table == $property->table('Y')
16643                     && $property->table('N')->is_empty)
16644                 || ($table == $property->table('N')
16645                     && $property->table('Y')->is_empty));
16646
16647             if ($table->is_empty) {
16648
16649                 if ($suppress_if_empty_warn_if_not) {
16650                     $table->set_fate($SUPPRESSED,
16651                                      $suppress_if_empty_warn_if_not);
16652                 }
16653
16654                 # Suppress (by skipping them) expected empty tables.
16655                 next TABLE if $expected_empty;
16656
16657                 # And setup to later output a warning for those that aren't
16658                 # known to be allowed to be empty.  Don't do the warning if
16659                 # this table is a child of another one to avoid duplicating
16660                 # the warning that should come from the parent one.
16661                 if (($table == $property || $table->parent == $table)
16662                     && $table->fate != $SUPPRESSED
16663                     && $table->fate != $MAP_PROXIED
16664                     && ! grep { $complete_name =~ /^$_$/ }
16665                                                     @tables_that_may_be_empty)
16666                 {
16667                     push @unhandled_properties, "$table";
16668                 }
16669
16670                 # An empty table is just the complement of everything.
16671                 $table->set_complement($Any) if $table != $property;
16672             }
16673             elsif ($expected_empty) {
16674                 my $because = "";
16675                 if ($suppress_if_empty_warn_if_not) {
16676                     $because = " because $suppress_if_empty_warn_if_not";
16677                 }
16678
16679                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
16680             }
16681
16682             # Some tables should match everything
16683             my $expected_full =
16684                 ($table->fate == $SUPPRESSED)
16685                 ? 0
16686                 : ($is_property)
16687                   ? # All these types of map tables will be full because
16688                     # they will have been populated with defaults
16689                     ($type == $ENUM || $type == $FORCED_BINARY)
16690
16691                   : # A match table should match everything if its method
16692                     # shows it should
16693                     ($table->matches_all
16694
16695                     # The complement of an empty binary table will match
16696                     # everything
16697                     || $is_complement_of_empty_binary
16698                     )
16699             ;
16700
16701             my $count = $table->count;
16702             if ($expected_full) {
16703                 if ($count != $MAX_UNICODE_CODEPOINTS) {
16704                     Carp::my_carp("$table matches only "
16705                     . clarify_number($count)
16706                     . " Unicode code points but should match "
16707                     . clarify_number($MAX_UNICODE_CODEPOINTS)
16708                     . " (off by "
16709                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
16710                     . ").  Proceeding anyway.");
16711                 }
16712
16713                 # Here is expected to be full.  If it is because it is the
16714                 # complement of an (empty) binary table that is to be
16715                 # suppressed, then suppress this one as well.
16716                 if ($is_complement_of_empty_binary) {
16717                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
16718                     my $opposing = $property->table($opposing_name);
16719                     my $opposing_status = $opposing->status;
16720                     if ($opposing_status) {
16721                         $table->set_status($opposing_status,
16722                                            $opposing->status_info);
16723                     }
16724                 }
16725             }
16726             elsif ($count == $MAX_UNICODE_CODEPOINTS
16727                    && ($table == $property || $table->leader == $table)
16728                    && $table->property->status ne $NORMAL)
16729             {
16730                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
16731             }
16732
16733             if ($table->fate >= $SUPPRESSED) {
16734                 if (! $is_property) {
16735                     my @children = $table->children;
16736                     foreach my $child (@children) {
16737                         if ($child->fate < $SUPPRESSED) {
16738                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
16739                         }
16740                     }
16741                 }
16742                 next TABLE;
16743
16744             }
16745
16746             if (! $is_property) {
16747
16748                 make_ucd_table_pod_entries($table) if $table->property == $perl;
16749
16750                 # Several things need to be done just once for each related
16751                 # group of match tables.  Do them on the parent.
16752                 if ($table->parent == $table) {
16753
16754                     # Add an entry in the pod file for the table; it also does
16755                     # the children.
16756                     make_re_pod_entries($table) if defined $pod_directory;
16757
16758                     # See if the the table matches identical code points with
16759                     # something that has already been output.  In that case,
16760                     # no need to have two files with the same code points in
16761                     # them.  We use the table's hash() method to store these
16762                     # in buckets, so that it is quite likely that if two
16763                     # tables are in the same bucket they will be identical, so
16764                     # don't have to compare tables frequently.  The tables
16765                     # have to have the same status to share a file, so add
16766                     # this to the bucket hash.  (The reason for this latter is
16767                     # that Heavy.pl associates a status with a file.)
16768                     # We don't check tables that are inverses of others, as it
16769                     # would lead to some coding complications, and checking
16770                     # all the regular ones should find everything.
16771                     if ($table->complement == 0) {
16772                         my $hash = $table->hash . ';' . $table->status;
16773
16774                         # Look at each table that is in the same bucket as
16775                         # this one would be.
16776                         foreach my $comparison
16777                                             (@{$match_tables_to_write{$hash}})
16778                         {
16779                             if ($table->matches_identically_to($comparison)) {
16780                                 $table->set_equivalent_to($comparison,
16781                                                                 Related => 0);
16782                                 next TABLE;
16783                             }
16784                         }
16785
16786                         # Here, not equivalent, add this table to the bucket.
16787                         push @{$match_tables_to_write{$hash}}, $table;
16788                     }
16789                 }
16790             }
16791             else {
16792
16793                 # Here is the property itself.
16794                 # Don't write out or make references to the $perl property
16795                 next if $table == $perl;
16796
16797                 make_ucd_table_pod_entries($table);
16798
16799                 # There is a mapping stored of the various synonyms to the
16800                 # standardized name of the property for utf8_heavy.pl.
16801                 # Also, the pod file contains entries of the form:
16802                 # \p{alias: *}         \p{full: *}
16803                 # rather than show every possible combination of things.
16804
16805                 my @property_aliases = $property->aliases;
16806
16807                 my $full_property_name = $property->full_name;
16808                 my $property_name = $property->name;
16809                 my $standard_property_name = standardize($property_name);
16810                 my $standard_property_full_name
16811                                         = standardize($full_property_name);
16812
16813                 # We also create for Unicode::UCD a list of aliases for
16814                 # the property.  The list starts with the property name;
16815                 # then its full name.  Legacy properties are not listed in
16816                 # Unicode::UCD.
16817                 my @property_list;
16818                 my @standard_list;
16819                 if ( $property->fate <= $MAP_PROXIED) {
16820                     @property_list = ($property_name, $full_property_name);
16821                     @standard_list = ($standard_property_name,
16822                                         $standard_property_full_name);
16823                 }
16824
16825                 # For each synonym ...
16826                 for my $i (0 .. @property_aliases - 1)  {
16827                     my $alias = $property_aliases[$i];
16828                     my $alias_name = $alias->name;
16829                     my $alias_standard = standardize($alias_name);
16830
16831
16832                     # Add other aliases to the list of property aliases
16833                     if ($property->fate <= $MAP_PROXIED
16834                         && ! grep { $alias_standard eq $_ } @standard_list)
16835                     {
16836                         push @property_list, $alias_name;
16837                         push @standard_list, $alias_standard;
16838                     }
16839
16840                     # For utf8_heavy, set the mapping of the alias to the
16841                     # property
16842                     if ($type == $STRING) {
16843                         if ($property->fate <= $MAP_PROXIED) {
16844                             $string_property_loose_to_name{$alias_standard}
16845                                             = $standard_property_name;
16846                         }
16847                     }
16848                     else {
16849                         if (exists ($loose_property_name_of{$alias_standard}))
16850                         {
16851                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
16852                         }
16853                         else {
16854                             $loose_property_name_of{$alias_standard}
16855                                                 = $standard_property_name;
16856                         }
16857
16858                         # Now for the re pod entry for this alias.  Skip if not
16859                         # outputting a pod; skip the first one, which is the
16860                         # full name so won't have an entry like: '\p{full: *}
16861                         # \p{full: *}', and skip if don't want an entry for
16862                         # this one.
16863                         next if $i == 0
16864                                 || ! defined $pod_directory
16865                                 || ! $alias->make_re_pod_entry;
16866
16867                         my $rhs = "\\p{$full_property_name: *}";
16868                         if ($property != $perl && $table->perl_extension) {
16869                             $rhs .= ' (Perl extension)';
16870                         }
16871                         push @match_properties,
16872                             format_pod_line($indent_info_column,
16873                                         '\p{' . $alias->name . ': *}',
16874                                         $rhs,
16875                                         $alias->status);
16876                     }
16877                 }
16878
16879                 # The list of all possible names is attached to each alias, so
16880                 # lookup is easy
16881                 if (@property_list) {
16882                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
16883                 }
16884
16885                 if ($property->fate <= $MAP_PROXIED) {
16886
16887                     # Similarly, we create for Unicode::UCD a list of
16888                     # property-value aliases.
16889
16890                     my $property_full_name = $property->full_name;
16891
16892                     # Look at each table in the property...
16893                     foreach my $table ($property->tables) {
16894                         my @values_list;
16895                         my $table_full_name = $table->full_name;
16896                         my $standard_table_full_name
16897                                               = standardize($table_full_name);
16898                         my $table_name = $table->name;
16899                         my $standard_table_name = standardize($table_name);
16900
16901                         # The list starts with the table name and its full
16902                         # name.
16903                         push @values_list, $table_name, $table_full_name;
16904
16905                         # We add to the table each unique alias that isn't
16906                         # discouraged from use.
16907                         foreach my $alias ($table->aliases) {
16908                             next if $alias->status
16909                                  && $alias->status eq $DISCOURAGED;
16910                             my $name = $alias->name;
16911                             my $standard = standardize($name);
16912                             next if $standard eq $standard_table_name;
16913                             next if $standard eq $standard_table_full_name;
16914                             push @values_list, $name;
16915                         }
16916
16917                         # Here @values_list is a list of all the aliases for
16918                         # the table.  That is, all the property-values given
16919                         # by this table.  By agreement with Unicode::UCD,
16920                         # if the name and full name are identical, and there
16921                         # are no other names, drop the duplcate entry to save
16922                         # memory.
16923                         if (@values_list == 2
16924                             && $values_list[0] eq $values_list[1])
16925                         {
16926                             pop @values_list
16927                         }
16928
16929                         # To save memory, unlike the similar list for property
16930                         # aliases above, only the standard forms hve the list.
16931                         # This forces an extra step of converting from input
16932                         # name to standard name, but the savings are
16933                         # considerable.  (There is only marginal savings if we
16934                         # did this with the property aliases.)
16935                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
16936                     }
16937                 }
16938
16939                 # Don't write out a mapping file if not desired.
16940                 next if ! $property->to_output_map;
16941             }
16942
16943             # Here, we know we want to write out the table, but don't do it
16944             # yet because there may be other tables that come along and will
16945             # want to share the file, and the file's comments will change to
16946             # mention them.  So save for later.
16947             push @writables, $table;
16948
16949         } # End of looping through the property and all its tables.
16950     } # End of looping through all properties.
16951
16952     # Now have all the tables that will have files written for them.  Do it.
16953     foreach my $table (@writables) {
16954         my @directory;
16955         my $filename;
16956         my $property = $table->property;
16957         my $is_property = ($table == $property);
16958         if (! $is_property) {
16959
16960             # Match tables for the property go in lib/$subdirectory, which is
16961             # the property's name.  Don't use the standard file name for this,
16962             # as may get an unfamiliar alias
16963             @directory = ($matches_directory, $property->external_name);
16964         }
16965         else {
16966
16967             @directory = $table->directory;
16968             $filename = $table->file;
16969         }
16970
16971         # Use specified filename if available, or default to property's
16972         # shortest name.  We need an 8.3 safe filename (which means "an 8
16973         # safe" filename, since after the dot is only 'pl', which is < 3)
16974         # The 2nd parameter is if the filename shouldn't be changed, and
16975         # it shouldn't iff there is a hard-coded name for this table.
16976         $filename = construct_filename(
16977                                 $filename || $table->external_name,
16978                                 ! $filename,    # mutable if no filename
16979                                 \@directory);
16980
16981         register_file_for_name($table, \@directory, $filename);
16982
16983         # Only need to write one file when shared by more than one
16984         # property
16985         next if ! $is_property
16986                 && ($table->leader != $table || $table->complement != 0);
16987
16988         # Construct a nice comment to add to the file
16989         $table->set_final_comment;
16990
16991         $table->write;
16992     }
16993
16994
16995     # Write out the pod file
16996     make_pod;
16997
16998     # And Heavy.pl, Name.pm, UCD.pl
16999     make_Heavy;
17000     make_Name_pm;
17001     make_UCD;
17002
17003     make_property_test_script() if $make_test_script;
17004     make_normalization_test_script() if $make_norm_test_script;
17005     return;
17006 }
17007
17008 my @white_space_separators = ( # This used only for making the test script.
17009                             "",
17010                             ' ',
17011                             "\t",
17012                             '   '
17013                         );
17014
17015 sub generate_separator($) {
17016     # This used only for making the test script.  It generates the colon or
17017     # equal separator between the property and property value, with random
17018     # white space surrounding the separator
17019
17020     my $lhs = shift;
17021
17022     return "" if $lhs eq "";  # No separator if there's only one (the r) side
17023
17024     # Choose space before and after randomly
17025     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
17026     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
17027
17028     # And return the whole complex, half the time using a colon, half the
17029     # equals
17030     return $spaces_before
17031             . (rand() < 0.5) ? '=' : ':'
17032             . $spaces_after;
17033 }
17034
17035 sub generate_tests($$$$$) {
17036     # This used only for making the test script.  It generates test cases that
17037     # are expected to compile successfully in perl.  Note that the lhs and
17038     # rhs are assumed to already be as randomized as the caller wants.
17039
17040     my $lhs = shift;           # The property: what's to the left of the colon
17041                                #  or equals separator
17042     my $rhs = shift;           # The property value; what's to the right
17043     my $valid_code = shift;    # A code point that's known to be in the
17044                                # table given by lhs=rhs; undef if table is
17045                                # empty
17046     my $invalid_code = shift;  # A code point known to not be in the table;
17047                                # undef if the table is all code points
17048     my $warning = shift;
17049
17050     # Get the colon or equal
17051     my $separator = generate_separator($lhs);
17052
17053     # The whole 'property=value'
17054     my $name = "$lhs$separator$rhs";
17055
17056     my @output;
17057     # Create a complete set of tests, with complements.
17058     if (defined $valid_code) {
17059         push @output, <<"EOC"
17060 Expect(1, $valid_code, '\\p{$name}', $warning);
17061 Expect(0, $valid_code, '\\p{^$name}', $warning);
17062 Expect(0, $valid_code, '\\P{$name}', $warning);
17063 Expect(1, $valid_code, '\\P{^$name}', $warning);
17064 EOC
17065     }
17066     if (defined $invalid_code) {
17067         push @output, <<"EOC"
17068 Expect(0, $invalid_code, '\\p{$name}', $warning);
17069 Expect(1, $invalid_code, '\\p{^$name}', $warning);
17070 Expect(1, $invalid_code, '\\P{$name}', $warning);
17071 Expect(0, $invalid_code, '\\P{^$name}', $warning);
17072 EOC
17073     }
17074     return @output;
17075 }
17076
17077 sub generate_error($$$) {
17078     # This used only for making the test script.  It generates test cases that
17079     # are expected to not only not match, but to be syntax or similar errors
17080
17081     my $lhs = shift;                # The property: what's to the left of the
17082                                     # colon or equals separator
17083     my $rhs = shift;                # The property value; what's to the right
17084     my $already_in_error = shift;   # Boolean; if true it's known that the
17085                                 # unmodified lhs and rhs will cause an error.
17086                                 # This routine should not force another one
17087     # Get the colon or equal
17088     my $separator = generate_separator($lhs);
17089
17090     # Since this is an error only, don't bother to randomly decide whether to
17091     # put the error on the left or right side; and assume that the rhs is
17092     # loosely matched, again for convenience rather than rigor.
17093     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
17094
17095     my $property = $lhs . $separator . $rhs;
17096
17097     return <<"EOC";
17098 Error('\\p{$property}');
17099 Error('\\P{$property}');
17100 EOC
17101 }
17102
17103 # These are used only for making the test script
17104 # XXX Maybe should also have a bad strict seps, which includes underscore.
17105
17106 my @good_loose_seps = (
17107             " ",
17108             "-",
17109             "\t",
17110             "",
17111             "_",
17112            );
17113 my @bad_loose_seps = (
17114            "/a/",
17115            ':=',
17116           );
17117
17118 sub randomize_stricter_name {
17119     # This used only for making the test script.  Take the input name and
17120     # return a randomized, but valid version of it under the stricter matching
17121     # rules.
17122
17123     my $name = shift;
17124     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17125
17126     # If the name looks like a number (integer, floating, or rational), do
17127     # some extra work
17128     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
17129         my $sign = $1;
17130         my $number = $2;
17131         my $separator = $3;
17132
17133         # If there isn't a sign, part of the time add a plus
17134         # Note: Not testing having any denominator having a minus sign
17135         if (! $sign) {
17136             $sign = '+' if rand() <= .3;
17137         }
17138
17139         # And add 0 or more leading zeros.
17140         $name = $sign . ('0' x int rand(10)) . $number;
17141
17142         if (defined $separator) {
17143             my $extra_zeros = '0' x int rand(10);
17144
17145             if ($separator eq '.') {
17146
17147                 # Similarly, add 0 or more trailing zeros after a decimal
17148                 # point
17149                 $name .= $extra_zeros;
17150             }
17151             else {
17152
17153                 # Or, leading zeros before the denominator
17154                 $name =~ s,/,/$extra_zeros,;
17155             }
17156         }
17157     }
17158
17159     # For legibility of the test, only change the case of whole sections at a
17160     # time.  To do this, first split into sections.  The split returns the
17161     # delimiters
17162     my @sections;
17163     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
17164         trace $section if main::DEBUG && $to_trace;
17165
17166         if (length $section > 1 && $section !~ /\D/) {
17167
17168             # If the section is a sequence of digits, about half the time
17169             # randomly add underscores between some of them.
17170             if (rand() > .5) {
17171
17172                 # Figure out how many underscores to add.  max is 1 less than
17173                 # the number of digits.  (But add 1 at the end to make sure
17174                 # result isn't 0, and compensate earlier by subtracting 2
17175                 # instead of 1)
17176                 my $num_underscores = int rand(length($section) - 2) + 1;
17177
17178                 # And add them evenly throughout, for convenience, not rigor
17179                 use integer;
17180                 my $spacing = (length($section) - 1)/ $num_underscores;
17181                 my $temp = $section;
17182                 $section = "";
17183                 for my $i (1 .. $num_underscores) {
17184                     $section .= substr($temp, 0, $spacing, "") . '_';
17185                 }
17186                 $section .= $temp;
17187             }
17188             push @sections, $section;
17189         }
17190         else {
17191
17192             # Here not a sequence of digits.  Change the case of the section
17193             # randomly
17194             my $switch = int rand(4);
17195             if ($switch == 0) {
17196                 push @sections, uc $section;
17197             }
17198             elsif ($switch == 1) {
17199                 push @sections, lc $section;
17200             }
17201             elsif ($switch == 2) {
17202                 push @sections, ucfirst $section;
17203             }
17204             else {
17205                 push @sections, $section;
17206             }
17207         }
17208     }
17209     trace "returning", join "", @sections if main::DEBUG && $to_trace;
17210     return join "", @sections;
17211 }
17212
17213 sub randomize_loose_name($;$) {
17214     # This used only for making the test script
17215
17216     my $name = shift;
17217     my $want_error = shift;  # if true, make an error
17218     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17219
17220     $name = randomize_stricter_name($name);
17221
17222     my @parts;
17223     push @parts, $good_loose_seps[rand(@good_loose_seps)];
17224
17225     # Preserve trailing ones for the sake of not stripping the underscore from
17226     # 'L_'
17227     for my $part (split /[-\s_]+ (?= . )/, $name) {
17228         if (@parts) {
17229             if ($want_error and rand() < 0.3) {
17230                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
17231                 $want_error = 0;
17232             }
17233             else {
17234                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
17235             }
17236         }
17237         push @parts, $part;
17238     }
17239     my $new = join("", @parts);
17240     trace "$name => $new" if main::DEBUG && $to_trace;
17241
17242     if ($want_error) {
17243         if (rand() >= 0.5) {
17244             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
17245         }
17246         else {
17247             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
17248         }
17249     }
17250     return $new;
17251 }
17252
17253 # Used to make sure don't generate duplicate test cases.
17254 my %test_generated;
17255
17256 sub make_property_test_script() {
17257     # This used only for making the test script
17258     # this written directly -- it's huge.
17259
17260     print "Making test script\n" if $verbosity >= $PROGRESS;
17261
17262     # This uses randomness to test different possibilities without testing all
17263     # possibilities.  To ensure repeatability, set the seed to 0.  But if
17264     # tests are added, it will perturb all later ones in the .t file
17265     srand 0;
17266
17267     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
17268
17269     # Keep going down an order of magnitude
17270     # until find that adding this quantity to
17271     # 1 remains 1; but put an upper limit on
17272     # this so in case this algorithm doesn't
17273     # work properly on some platform, that we
17274     # won't loop forever.
17275     my $digits = 0;
17276     my $min_floating_slop = 1;
17277     while (1+ $min_floating_slop != 1
17278             && $digits++ < 50)
17279     {
17280         my $next = $min_floating_slop / 10;
17281         last if $next == 0; # If underflows,
17282                             # use previous one
17283         $min_floating_slop = $next;
17284     }
17285
17286     # It doesn't matter whether the elements of this array contain single lines
17287     # or multiple lines. main::write doesn't count the lines.
17288     my @output;
17289
17290     # Sort these so get results in same order on different runs of this
17291     # program
17292     foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
17293         foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
17294
17295             # Find code points that match, and don't match this table.
17296             my $valid = $table->get_valid_code_point;
17297             my $invalid = $table->get_invalid_code_point;
17298             my $warning = ($table->status eq $DEPRECATED)
17299                             ? "'deprecated'"
17300                             : '""';
17301
17302             # Test each possible combination of the property's aliases with
17303             # the table's.  If this gets to be too many, could do what is done
17304             # in the set_final_comment() for Tables
17305             my @table_aliases = $table->aliases;
17306             my @property_aliases = $table->property->aliases;
17307
17308             # Every property can be optionally be prefixed by 'Is_', so test
17309             # that those work, by creating such a new alias for each
17310             # pre-existing one.
17311             push @property_aliases, map { Alias->new("Is_" . $_->name,
17312                                                     $_->loose_match,
17313                                                     $_->make_re_pod_entry,
17314                                                     $_->ok_as_filename,
17315                                                     $_->status,
17316                                                     $_->ucd,
17317                                                     )
17318                                          } @property_aliases;
17319             my $max = max(scalar @table_aliases, scalar @property_aliases);
17320             for my $j (0 .. $max - 1) {
17321
17322                 # The current alias for property is the next one on the list,
17323                 # or if beyond the end, start over.  Similarly for table
17324                 my $property_name
17325                             = $property_aliases[$j % @property_aliases]->name;
17326
17327                 $property_name = "" if $table->property == $perl;
17328                 my $table_alias = $table_aliases[$j % @table_aliases];
17329                 my $table_name = $table_alias->name;
17330                 my $loose_match = $table_alias->loose_match;
17331
17332                 # If the table doesn't have a file, any test for it is
17333                 # already guaranteed to be in error
17334                 my $already_error = ! $table->file_path;
17335
17336                 # Generate error cases for this alias.
17337                 push @output, generate_error($property_name,
17338                                              $table_name,
17339                                              $already_error);
17340
17341                 # If the table is guaranteed to always generate an error,
17342                 # quit now without generating success cases.
17343                 next if $already_error;
17344
17345                 # Now for the success cases.
17346                 my $random;
17347                 if ($loose_match) {
17348
17349                     # For loose matching, create an extra test case for the
17350                     # standard name.
17351                     my $standard = standardize($table_name);
17352
17353                     # $test_name should be a unique combination for each test
17354                     # case; used just to avoid duplicate tests
17355                     my $test_name = "$property_name=$standard";
17356
17357                     # Don't output duplicate test cases.
17358                     if (! exists $test_generated{$test_name}) {
17359                         $test_generated{$test_name} = 1;
17360                         push @output, generate_tests($property_name,
17361                                                      $standard,
17362                                                      $valid,
17363                                                      $invalid,
17364                                                      $warning,
17365                                                  );
17366                     }
17367                     $random = randomize_loose_name($table_name)
17368                 }
17369                 else { # Stricter match
17370                     $random = randomize_stricter_name($table_name);
17371                 }
17372
17373                 # Now for the main test case for this alias.
17374                 my $test_name = "$property_name=$random";
17375                 if (! exists $test_generated{$test_name}) {
17376                     $test_generated{$test_name} = 1;
17377                     push @output, generate_tests($property_name,
17378                                                  $random,
17379                                                  $valid,
17380                                                  $invalid,
17381                                                  $warning,
17382                                              );
17383
17384                     # If the name is a rational number, add tests for the
17385                     # floating point equivalent.
17386                     if ($table_name =~ qr{/}) {
17387
17388                         # Calculate the float, and find just the fraction.
17389                         my $float = eval $table_name;
17390                         my ($whole, $fraction)
17391                                             = $float =~ / (.*) \. (.*) /x;
17392
17393                         # Starting with one digit after the decimal point,
17394                         # create a test for each possible precision (number of
17395                         # digits past the decimal point) until well beyond the
17396                         # native number found on this machine.  (If we started
17397                         # with 0 digits, it would be an integer, which could
17398                         # well match an unrelated table)
17399                         PLACE:
17400                         for my $i (1 .. $min_floating_slop + 3) {
17401                             my $table_name = sprintf("%.*f", $i, $float);
17402                             if ($i < $MIN_FRACTION_LENGTH) {
17403
17404                                 # If the test case has fewer digits than the
17405                                 # minimum acceptable precision, it shouldn't
17406                                 # succeed, so we expect an error for it.
17407                                 # E.g., 2/3 = .7 at one decimal point, and we
17408                                 # shouldn't say it matches .7.  We should make
17409                                 # it be .667 at least before agreeing that the
17410                                 # intent was to match 2/3.  But at the
17411                                 # less-than- acceptable level of precision, it
17412                                 # might actually match an unrelated number.
17413                                 # So don't generate a test case if this
17414                                 # conflating is possible.  In our example, we
17415                                 # don't want 2/3 matching 7/10, if there is
17416                                 # a 7/10 code point.
17417                                 for my $existing
17418                                         (keys %nv_floating_to_rational)
17419                                 {
17420                                     next PLACE
17421                                         if abs($table_name - $existing)
17422                                                 < $MAX_FLOATING_SLOP;
17423                                 }
17424                                 push @output, generate_error($property_name,
17425                                                              $table_name,
17426                                                              1   # 1 => already an error
17427                                               );
17428                             }
17429                             else {
17430
17431                                 # Here the number of digits exceeds the
17432                                 # minimum we think is needed.  So generate a
17433                                 # success test case for it.
17434                                 push @output, generate_tests($property_name,
17435                                                              $table_name,
17436                                                              $valid,
17437                                                              $invalid,
17438                                                              $warning,
17439                                              );
17440                             }
17441                         }
17442                     }
17443                 }
17444             }
17445         }
17446     }
17447
17448     &write($t_path,
17449            0,           # Not utf8;
17450            [<DATA>,
17451             @output,
17452             (map {"Test_X('$_');\n"} @backslash_X_tests),
17453             "Finished();\n"]);
17454     return;
17455 }
17456
17457 sub make_normalization_test_script() {
17458     print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17459
17460     my $n_path = 'TestNorm.pl';
17461
17462     unshift @normalization_tests, <<'END';
17463 use utf8;
17464 use Test::More;
17465
17466 sub ord_string {    # Convert packed ords to printable string
17467     use charnames ();
17468     return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17469                                                 unpack "U*", shift) .  "'";
17470     #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
17471 }
17472
17473 sub Test_N {
17474     my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17475     my $display_source = ord_string($source);
17476     my $display_nfc = ord_string($nfc);
17477     my $display_nfd = ord_string($nfd);
17478     my $display_nfkc = ord_string($nfkc);
17479     my $display_nfkd = ord_string($nfkd);
17480
17481     use Unicode::Normalize;
17482     #    NFC
17483     #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
17484     #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
17485     #
17486     #    NFD
17487     #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
17488     #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
17489     #
17490     #    NFKC
17491     #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17492     #      toNFKC(nfkc) == toNFKC(nfkd)
17493     #
17494     #    NFKD
17495     #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17496     #      toNFKD(nfkc) == toNFKD(nfkd)
17497
17498     is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17499     is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17500     is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17501     is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17502     is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17503
17504     is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17505     is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17506     is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17507     is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17508     is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17509
17510     is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17511     is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17512     is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17513     is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17514     is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17515
17516     is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17517     is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17518     is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17519     is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17520     is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17521 }
17522 END
17523
17524     &write($n_path,
17525            1,           # Is utf8;
17526            [
17527             @normalization_tests,
17528             'done_testing();'
17529             ]);
17530     return;
17531 }
17532
17533 # This is a list of the input files and how to handle them.  The files are
17534 # processed in their order in this list.  Some reordering is possible if
17535 # desired, but the v0 files should be first, and the extracted before the
17536 # others except DAge.txt (as data in an extracted file can be over-ridden by
17537 # the non-extracted.  Some other files depend on data derived from an earlier
17538 # file, like UnicodeData requires data from Jamo, and the case changing and
17539 # folding requires data from Unicode.  Mostly, it is safest to order by first
17540 # version releases in (except the Jamo).  DAge.txt is read before the
17541 # extracted ones because of the rarely used feature $compare_versions.  In the
17542 # unlikely event that there were ever an extracted file that contained the Age
17543 # property information, it would have to go in front of DAge.
17544 #
17545 # The version strings allow the program to know whether to expect a file or
17546 # not, but if a file exists in the directory, it will be processed, even if it
17547 # is in a version earlier than expected, so you can copy files from a later
17548 # release into an earlier release's directory.
17549 my @input_file_objects = (
17550     Input_file->new('PropertyAliases.txt', v0,
17551                     Handler => \&process_PropertyAliases,
17552                     ),
17553     Input_file->new(undef, v0,  # No file associated with this
17554                     Progress_Message => 'Finishing property setup',
17555                     Handler => \&finish_property_setup,
17556                     ),
17557     Input_file->new('PropValueAliases.txt', v0,
17558                      Handler => \&process_PropValueAliases,
17559                      Has_Missings_Defaults => $NOT_IGNORED,
17560                      ),
17561     Input_file->new('DAge.txt', v3.2.0,
17562                     Has_Missings_Defaults => $NOT_IGNORED,
17563                     Property => 'Age'
17564                     ),
17565     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17566                     Property => 'General_Category',
17567                     ),
17568     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17569                     Property => 'Canonical_Combining_Class',
17570                     Has_Missings_Defaults => $NOT_IGNORED,
17571                     ),
17572     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17573                     Property => 'Numeric_Type',
17574                     Has_Missings_Defaults => $NOT_IGNORED,
17575                     ),
17576     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17577                     Property => 'East_Asian_Width',
17578                     Has_Missings_Defaults => $NOT_IGNORED,
17579                     ),
17580     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17581                     Property => 'Line_Break',
17582                     Has_Missings_Defaults => $NOT_IGNORED,
17583                     ),
17584     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17585                     Property => 'Bidi_Class',
17586                     Has_Missings_Defaults => $NOT_IGNORED,
17587                     ),
17588     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17589                     Property => 'Decomposition_Type',
17590                     Has_Missings_Defaults => $NOT_IGNORED,
17591                     ),
17592     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17593     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17594                     Property => 'Numeric_Value',
17595                     Each_Line_Handler => \&filter_numeric_value_line,
17596                     Has_Missings_Defaults => $NOT_IGNORED,
17597                     ),
17598     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17599                     Property => 'Joining_Group',
17600                     Has_Missings_Defaults => $NOT_IGNORED,
17601                     ),
17602
17603     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17604                     Property => 'Joining_Type',
17605                     Has_Missings_Defaults => $NOT_IGNORED,
17606                     ),
17607     Input_file->new('Jamo.txt', v2.0.0,
17608                     Property => 'Jamo_Short_Name',
17609                     Each_Line_Handler => \&filter_jamo_line,
17610                     ),
17611     Input_file->new('UnicodeData.txt', v1.1.5,
17612                     Pre_Handler => \&setup_UnicodeData,
17613
17614                     # We clean up this file for some early versions.
17615                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
17616                                             ? \&filter_v1_ucd
17617                                             : ($v_version eq v2.1.5)
17618                                                 ? \&filter_v2_1_5_ucd
17619
17620                                                 # And for 5.14 Perls with 6.0,
17621                                                 # have to also make changes
17622                                                 : ($v_version ge v6.0.0
17623                                                    && $^V lt v5.17.0)
17624                                                     ? \&filter_v6_ucd
17625                                                     : undef),
17626
17627                                             # Early versions did not have the
17628                                             # proper Unicode_1 names for the
17629                                             # controls
17630                                             (($v_version lt v3.0.0)
17631                                             ? \&filter_early_U1_names
17632                                             : undef),
17633
17634                                             # Early versions did not correctly
17635                                             # use the later method for giving
17636                                             # decimal digit values
17637                                             (($v_version le v3.2.0)
17638                                             ? \&filter_bad_Nd_ucd
17639                                             : undef),
17640
17641                                             # And the main filter
17642                                             \&filter_UnicodeData_line,
17643                                          ],
17644                     EOF_Handler => \&EOF_UnicodeData,
17645                     ),
17646     Input_file->new('ArabicShaping.txt', v2.0.0,
17647                     Each_Line_Handler =>
17648                         ($v_version lt 4.1.0)
17649                                     ? \&filter_old_style_arabic_shaping
17650                                     : undef,
17651                     # The first field after the range is a "schematic name"
17652                     # not used by Perl
17653                     Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
17654                     Has_Missings_Defaults => $NOT_IGNORED,
17655                     ),
17656     Input_file->new('Blocks.txt', v2.0.0,
17657                     Property => 'Block',
17658                     Has_Missings_Defaults => $NOT_IGNORED,
17659                     Each_Line_Handler => \&filter_blocks_lines
17660                     ),
17661     Input_file->new('PropList.txt', v2.0.0,
17662                     Each_Line_Handler => (($v_version lt v3.1.0)
17663                                             ? \&filter_old_style_proplist
17664                                             : undef),
17665                     ),
17666     Input_file->new('Unihan.txt', v2.0.0,
17667                     Pre_Handler => \&setup_unihan,
17668                     Optional => 1,
17669                     Each_Line_Handler => \&filter_unihan_line,
17670                         ),
17671     Input_file->new('SpecialCasing.txt', v2.1.8,
17672                     Each_Line_Handler => ($v_version eq 2.1.8)
17673                                          ? \&filter_2_1_8_special_casing_line
17674                                          : \&filter_special_casing_line,
17675                     Pre_Handler => \&setup_special_casing,
17676                     Has_Missings_Defaults => $IGNORED,
17677                     ),
17678     Input_file->new(
17679                     'LineBreak.txt', v3.0.0,
17680                     Has_Missings_Defaults => $NOT_IGNORED,
17681                     Property => 'Line_Break',
17682                     # Early versions had problematic syntax
17683                     Each_Line_Handler => (($v_version lt v3.1.0)
17684                                         ? \&filter_early_ea_lb
17685                                         : undef),
17686                     ),
17687     Input_file->new('EastAsianWidth.txt', v3.0.0,
17688                     Property => 'East_Asian_Width',
17689                     Has_Missings_Defaults => $NOT_IGNORED,
17690                     # Early versions had problematic syntax
17691                     Each_Line_Handler => (($v_version lt v3.1.0)
17692                                         ? \&filter_early_ea_lb
17693                                         : undef),
17694                     ),
17695     Input_file->new('CompositionExclusions.txt', v3.0.0,
17696                     Property => 'Composition_Exclusion',
17697                     ),
17698     Input_file->new('BidiMirroring.txt', v3.0.1,
17699                     Property => 'Bidi_Mirroring_Glyph',
17700                     Has_Missings_Defaults => ($v_version lt v6.2.0)
17701                                               ? $NO_DEFAULTS
17702                                               # Is <none> which doesn't mean
17703                                               # anything to us, we will use the
17704                                               # null string
17705                                               : $IGNORED,
17706
17707                     ),
17708     Input_file->new("NormTest.txt", v3.0.0,
17709                      Handler => \&process_NormalizationsTest,
17710                      Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
17711                     ),
17712     Input_file->new('CaseFolding.txt', v3.0.1,
17713                     Pre_Handler => \&setup_case_folding,
17714                     Each_Line_Handler =>
17715                         [ ($v_version lt v3.1.0)
17716                                  ? \&filter_old_style_case_folding
17717                                  : undef,
17718                            \&filter_case_folding_line
17719                         ],
17720                     Has_Missings_Defaults => $IGNORED,
17721                     ),
17722     Input_file->new('DCoreProperties.txt', v3.1.0,
17723                     # 5.2 changed this file
17724                     Has_Missings_Defaults => (($v_version ge v5.2.0)
17725                                             ? $NOT_IGNORED
17726                                             : $NO_DEFAULTS),
17727                     ),
17728     Input_file->new('Scripts.txt', v3.1.0,
17729                     Property => 'Script',
17730                     Has_Missings_Defaults => $NOT_IGNORED,
17731                     ),
17732     Input_file->new('DNormalizationProps.txt', v3.1.0,
17733                     Has_Missings_Defaults => $NOT_IGNORED,
17734                     Each_Line_Handler => (($v_version lt v4.0.1)
17735                                       ? \&filter_old_style_normalization_lines
17736                                       : undef),
17737                     ),
17738     Input_file->new('HangulSyllableType.txt', v0,
17739                     Has_Missings_Defaults => $NOT_IGNORED,
17740                     Property => 'Hangul_Syllable_Type',
17741                     Pre_Handler => ($v_version lt v4.0.0)
17742                                    ? \&generate_hst
17743                                    : undef,
17744                     ),
17745     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
17746                     Property => 'Word_Break',
17747                     Has_Missings_Defaults => $NOT_IGNORED,
17748                     ),
17749     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
17750                     Property => 'Grapheme_Cluster_Break',
17751                     Has_Missings_Defaults => $NOT_IGNORED,
17752                     Pre_Handler => ($v_version lt v4.1.0)
17753                                    ? \&generate_GCB
17754                                    : undef,
17755                     ),
17756     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
17757                     Handler => \&process_GCB_test,
17758                     ),
17759     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
17760                     Skip => 'Validation Tests',
17761                     ),
17762     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
17763                     Skip => 'Validation Tests',
17764                     ),
17765     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
17766                     Skip => 'Validation Tests',
17767                     ),
17768     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
17769                     Property => 'Sentence_Break',
17770                     Has_Missings_Defaults => $NOT_IGNORED,
17771                     ),
17772     Input_file->new('NamedSequences.txt', v4.1.0,
17773                     Handler => \&process_NamedSequences
17774                     ),
17775     Input_file->new('NameAliases.txt', v0,
17776                     Property => 'Name_Alias',
17777                     Pre_Handler => ($v_version le v6.0.0)
17778                                    ? \&setup_early_name_alias
17779                                    : undef,
17780                     Each_Line_Handler => ($v_version le v6.0.0)
17781                                    ? \&filter_early_version_name_alias_line
17782                                    : \&filter_later_version_name_alias_line,
17783                     ),
17784     Input_file->new("BidiTest.txt", v5.2.0,
17785                     Skip => 'Validation Tests',
17786                     ),
17787     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
17788                     Optional => 1,
17789                     Each_Line_Handler => \&filter_unihan_line,
17790                     ),
17791     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
17792                     Optional => 1,
17793                     Each_Line_Handler => \&filter_unihan_line,
17794                     ),
17795     Input_file->new('UnihanIRGSources.txt', v5.2.0,
17796                     Optional => 1,
17797                     Pre_Handler => \&setup_unihan,
17798                     Each_Line_Handler => \&filter_unihan_line,
17799                     ),
17800     Input_file->new('UnihanNumericValues.txt', v5.2.0,
17801                     Optional => 1,
17802                     Each_Line_Handler => \&filter_unihan_line,
17803                     ),
17804     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
17805                     Optional => 1,
17806                     Each_Line_Handler => \&filter_unihan_line,
17807                     ),
17808     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
17809                     Optional => 1,
17810                     Each_Line_Handler => \&filter_unihan_line,
17811                     ),
17812     Input_file->new('UnihanReadings.txt', v5.2.0,
17813                     Optional => 1,
17814                     Each_Line_Handler => \&filter_unihan_line,
17815                     ),
17816     Input_file->new('UnihanVariants.txt', v5.2.0,
17817                     Optional => 1,
17818                     Each_Line_Handler => \&filter_unihan_line,
17819                     ),
17820     Input_file->new('ScriptExtensions.txt', v6.0.0,
17821                     Property => 'Script_Extensions',
17822                     Pre_Handler => \&setup_script_extensions,
17823                     Each_Line_Handler => \&filter_script_extensions_line,
17824                     Has_Missings_Defaults => (($v_version le v6.0.0)
17825                                             ? $NO_DEFAULTS
17826                                             : $IGNORED),
17827                     ),
17828     # The two Indic files are actually available starting in v6.0.0, but their
17829     # property values are missing from PropValueAliases.txt in that release,
17830     # so that further work would have to be done to get them to work properly
17831     # for that release.
17832     Input_file->new('IndicMatraCategory.txt', v6.1.0,
17833                     Property => 'Indic_Matra_Category',
17834                     Has_Missings_Defaults => $NOT_IGNORED,
17835                     Skip => "Provisional; for the analysis and processing of Indic scripts",
17836                     ),
17837     Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
17838                     Property => 'Indic_Syllabic_Category',
17839                     Has_Missings_Defaults => $NOT_IGNORED,
17840                     Skip => "Provisional; for the analysis and processing of Indic scripts",
17841                     ),
17842     Input_file->new('BidiBrackets.txt', v6.3.0,
17843                     Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
17844                     Has_Missings_Defaults => $NO_DEFAULTS,
17845                     ),
17846     Input_file->new("BidiCharacterTest.txt", v6.3.0,
17847                     Skip => 'Validation Tests',
17848                     ),
17849 );
17850
17851 # End of all the preliminaries.
17852 # Do it...
17853
17854 if ($compare_versions) {
17855     Carp::my_carp(<<END
17856 Warning.  \$compare_versions is set.  Output is not suitable for production
17857 END
17858     );
17859 }
17860
17861 # Put into %potential_files a list of all the files in the directory structure
17862 # that could be inputs to this program, excluding those that we should ignore.
17863 # Use absolute file names because it makes it easier across machine types.
17864 my @ignored_files_full_names = map { File::Spec->rel2abs(
17865                                      internal_file_to_platform($_))
17866                                 } keys %ignored_files;
17867 File::Find::find({
17868     wanted=>sub {
17869         return unless /\.txt$/i;  # Some platforms change the name's case
17870         my $full = lc(File::Spec->rel2abs($_));
17871         $potential_files{$full} = 1
17872                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
17873         return;
17874     }
17875 }, File::Spec->curdir());
17876
17877 my @mktables_list_output_files;
17878 my $old_start_time = 0;
17879 my $old_options = "";
17880
17881 if (! -e $file_list) {
17882     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
17883     $write_unchanged_files = 1;
17884 } elsif ($write_unchanged_files) {
17885     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
17886 }
17887 else {
17888     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
17889     my $file_handle;
17890     if (! open $file_handle, "<", $file_list) {
17891         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
17892         $glob_list = 1;
17893     }
17894     else {
17895         my @input;
17896
17897         # Read and parse mktables.lst, placing the results from the first part
17898         # into @input, and the second part into @mktables_list_output_files
17899         for my $list ( \@input, \@mktables_list_output_files ) {
17900             while (<$file_handle>) {
17901                 s/^ \s+ | \s+ $//xg;
17902                 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
17903                     $old_start_time = $1;
17904                     next;
17905                 }
17906                 if (/^ \s* \# \s* From\ options\ (.+) /x) {
17907                     $old_options = $1;
17908                     next;
17909                 }
17910                 next if /^ \s* (?: \# .* )? $/x;
17911                 last if /^ =+ $/x;
17912                 my ( $file ) = split /\t/;
17913                 push @$list, $file;
17914             }
17915             @$list = uniques(@$list);
17916             next;
17917         }
17918
17919         # Look through all the input files
17920         foreach my $input (@input) {
17921             next if $input eq 'version'; # Already have checked this.
17922
17923             # Ignore if doesn't exist.  The checking about whether we care or
17924             # not is done via the Input_file object.
17925             next if ! file_exists($input);
17926
17927             # The paths are stored with relative names, and with '/' as the
17928             # delimiter; convert to absolute on this machine
17929             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
17930             $potential_files{lc $full} = 1
17931                 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
17932         }
17933     }
17934
17935     close $file_handle;
17936 }
17937
17938 if ($glob_list) {
17939
17940     # Here wants to process all .txt files in the directory structure.
17941     # Convert them to full path names.  They are stored in the platform's
17942     # relative style
17943     my @known_files;
17944     foreach my $object (@input_file_objects) {
17945         my $file = $object->file;
17946         next unless defined $file;
17947         push @known_files, File::Spec->rel2abs($file);
17948     }
17949
17950     my @unknown_input_files;
17951     foreach my $file (keys %potential_files) {  # The keys are stored in lc
17952         next if grep { $file eq lc($_) } @known_files;
17953
17954         # Here, the file is unknown to us.  Get relative path name
17955         $file = File::Spec->abs2rel($file);
17956         push @unknown_input_files, $file;
17957
17958         # What will happen is we create a data structure for it, and add it to
17959         # the list of input files to process.  First get the subdirectories
17960         # into an array
17961         my (undef, $directories, undef) = File::Spec->splitpath($file);
17962         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
17963         my @directories = File::Spec->splitdir($directories);
17964
17965         # If the file isn't extracted (meaning none of the directories is the
17966         # extracted one), just add it to the end of the list of inputs.
17967         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
17968             push @input_file_objects, Input_file->new($file, v0);
17969         }
17970         else {
17971
17972             # Here, the file is extracted.  It needs to go ahead of most other
17973             # processing.  Search for the first input file that isn't a
17974             # special required property (that is, find one whose first_release
17975             # is non-0), and isn't extracted.  Also, the Age property file is
17976             # processed before the extracted ones, just in case
17977             # $compare_versions is set.
17978             for (my $i = 0; $i < @input_file_objects; $i++) {
17979                 if ($input_file_objects[$i]->first_released ne v0
17980                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
17981                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
17982                 {
17983                     splice @input_file_objects, $i, 0,
17984                                                 Input_file->new($file, v0);
17985                     last;
17986                 }
17987             }
17988
17989         }
17990     }
17991     if (@unknown_input_files) {
17992         print STDERR simple_fold(join_lines(<<END
17993
17994 The following files are unknown as to how to handle.  Assuming they are
17995 typical property files.  You'll know by later error messages if it worked or
17996 not:
17997 END
17998         ) . " " . join(", ", @unknown_input_files) . "\n\n");
17999     }
18000 } # End of looking through directory structure for more .txt files.
18001
18002 # Create the list of input files from the objects we have defined, plus
18003 # version
18004 my @input_files = qw(version Makefile);
18005 foreach my $object (@input_file_objects) {
18006     my $file = $object->file;
18007     next if ! defined $file;    # Not all objects have files
18008     next if $object->optional && ! -e $file;
18009     push @input_files,  $file;
18010 }
18011
18012 if ( $verbosity >= $VERBOSE ) {
18013     print "Expecting ".scalar( @input_files )." input files. ",
18014          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
18015 }
18016
18017 # We set $most_recent to be the most recently changed input file, including
18018 # this program itself (done much earlier in this file)
18019 foreach my $in (@input_files) {
18020     next unless -e $in;        # Keep going even if missing a file
18021     my $mod_time = (stat $in)[9];
18022     $most_recent = $mod_time if $mod_time > $most_recent;
18023
18024     # See that the input files have distinct names, to warn someone if they
18025     # are adding a new one
18026     if ($make_list) {
18027         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
18028         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
18029         my @directories = File::Spec->splitdir($directories);
18030         my $base = $file =~ s/\.txt$//;
18031         construct_filename($file, 'mutable', \@directories);
18032     }
18033 }
18034
18035 # We use 'Makefile' just to see if it has changed since the last time we
18036 # rebuilt.  Now discard it.
18037 @input_files = grep { $_ ne 'Makefile' } @input_files;
18038
18039 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
18040               || ! scalar @mktables_list_output_files  # or if no outputs known
18041               || $old_start_time < $most_recent        # or out-of-date
18042               || $old_options ne $command_line_arguments; # or with different
18043                                                           # options
18044
18045 # Now we check to see if any output files are older than youngest, if
18046 # they are, we need to continue on, otherwise we can presumably bail.
18047 if (! $rebuild) {
18048     foreach my $out (@mktables_list_output_files) {
18049         if ( ! file_exists($out)) {
18050             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
18051             $rebuild = 1;
18052             last;
18053          }
18054         #local $to_trace = 1 if main::DEBUG;
18055         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
18056         if ( (stat $out)[9] <= $most_recent ) {
18057             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
18058             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
18059             $rebuild = 1;
18060             last;
18061         }
18062     }
18063 }
18064 if (! $rebuild) {
18065     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
18066     exit(0);
18067 }
18068 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
18069
18070 # Ready to do the major processing.  First create the perl pseudo-property.
18071 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
18072
18073 # Process each input file
18074 foreach my $file (@input_file_objects) {
18075     $file->run;
18076 }
18077
18078 # Finish the table generation.
18079
18080 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
18081 finish_Unicode();
18082
18083 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
18084 compile_perl();
18085
18086 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
18087 add_perl_synonyms();
18088
18089 print "Writing tables\n" if $verbosity >= $PROGRESS;
18090 write_all_tables();
18091
18092 # Write mktables.lst
18093 if ( $file_list and $make_list ) {
18094
18095     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
18096     foreach my $file (@input_files, @files_actually_output) {
18097         my (undef, $directories, $file) = File::Spec->splitpath($file);
18098         my @directories = File::Spec->splitdir($directories);
18099         $file = join '/', @directories, $file;
18100     }
18101
18102     my $ofh;
18103     if (! open $ofh,">",$file_list) {
18104         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
18105         return
18106     }
18107     else {
18108         my $localtime = localtime $start_time;
18109         print $ofh <<"END";
18110 #
18111 # $file_list -- File list for $0.
18112 #
18113 #   Autogenerated starting on $start_time ($localtime)
18114 #   From options $command_line_arguments
18115 #
18116 # - First section is input files
18117 #   ($0 itself is not listed but is automatically considered an input)
18118 # - Section separator is /^=+\$/
18119 # - Second section is a list of output files.
18120 # - Lines matching /^\\s*#/ are treated as comments
18121 #   which along with blank lines are ignored.
18122 #
18123
18124 # Input files:
18125
18126 END
18127         print $ofh "$_\n" for sort(@input_files);
18128         print $ofh "\n=================================\n# Output files:\n\n";
18129         print $ofh "$_\n" for sort @files_actually_output;
18130         print $ofh "\n# ",scalar(@input_files)," input files\n",
18131                 "# ",scalar(@files_actually_output)+1," output files\n\n",
18132                 "# End list\n";
18133         close $ofh
18134             or Carp::my_carp("Failed to close $ofh: $!");
18135
18136         print "Filelist has ",scalar(@input_files)," input files and ",
18137             scalar(@files_actually_output)+1," output files\n"
18138             if $verbosity >= $VERBOSE;
18139     }
18140 }
18141
18142 # Output these warnings unless -q explicitly specified.
18143 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
18144     if (@unhandled_properties) {
18145         print "\nProperties and tables that unexpectedly have no code points\n";
18146         foreach my $property (sort @unhandled_properties) {
18147             print $property, "\n";
18148         }
18149     }
18150
18151     if (%potential_files) {
18152         print "\nInput files that are not considered:\n";
18153         foreach my $file (sort keys %potential_files) {
18154             print File::Spec->abs2rel($file), "\n";
18155         }
18156     }
18157     print "\nAll done\n" if $verbosity >= $VERBOSE;
18158 }
18159 exit(0);
18160
18161 # TRAILING CODE IS USED BY make_property_test_script()
18162 __DATA__
18163
18164 use strict;
18165 use warnings;
18166
18167 # If run outside the normal test suite on an ASCII platform, you can
18168 # just create a latin1_to_native() function that just returns its
18169 # inputs, because that's the only function used from test.pl
18170 require "test.pl";
18171
18172 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
18173 # constructed by mktables from the tables it generates, so if mktables is
18174 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
18175 # feasible properties; a few aren't currently feasible; see
18176 # is_code_point_usable() in mktables for details.
18177
18178 # Standard test packages are not used because this manipulates SIG_WARN.  It
18179 # exits 0 if every non-skipped test succeeded; -1 if any failed.
18180
18181 my $Tests = 0;
18182 my $Fails = 0;
18183
18184 sub Expect($$$$) {
18185     my $expected = shift;
18186     my $ord = shift;
18187     my $regex  = shift;
18188     my $warning_type = shift;   # Type of warning message, like 'deprecated'
18189                                 # or empty if none
18190     my $line   = (caller)[2];
18191
18192     # Convert the code point to hex form
18193     my $string = sprintf "\"\\x{%04X}\"", $ord;
18194
18195     my @tests = "";
18196
18197     # The first time through, use all warnings.  If the input should generate
18198     # a warning, add another time through with them turned off
18199     push @tests, "no warnings '$warning_type';" if $warning_type;
18200
18201     foreach my $no_warnings (@tests) {
18202
18203         # Store any warning messages instead of outputting them
18204         local $SIG{__WARN__} = $SIG{__WARN__};
18205         my $warning_message;
18206         $SIG{__WARN__} = sub { $warning_message = $_[0] };
18207
18208         $Tests++;
18209
18210         # A string eval is needed because of the 'no warnings'.
18211         # Assumes no parens in the regular expression
18212         my $result = eval "$no_warnings
18213                             my \$RegObj = qr($regex);
18214                             $string =~ \$RegObj ? 1 : 0";
18215         if (not defined $result) {
18216             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
18217             $Fails++;
18218         }
18219         elsif ($result ^ $expected) {
18220             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
18221             $Fails++;
18222         }
18223         elsif ($warning_message) {
18224             if (! $warning_type || ($warning_type && $no_warnings)) {
18225                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
18226                 $Fails++;
18227             }
18228             else {
18229                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
18230             }
18231         }
18232         elsif ($warning_type && ! $no_warnings) {
18233             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
18234             $Fails++;
18235         }
18236         else {
18237             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
18238         }
18239     }
18240     return;
18241 }
18242
18243 sub Error($) {
18244     my $regex  = shift;
18245     $Tests++;
18246     if (eval { 'x' =~ qr/$regex/; 1 }) {
18247         $Fails++;
18248         my $line = (caller)[2];
18249         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
18250     }
18251     else {
18252         my $line = (caller)[2];
18253         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
18254     }
18255     return;
18256 }
18257
18258 # GCBTest.txt character that separates grapheme clusters
18259 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
18260 utf8::upgrade($breakable_utf8);
18261
18262 # GCBTest.txt character that indicates that the adjoining code points are part
18263 # of the same grapheme cluster
18264 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
18265 utf8::upgrade($nobreak_utf8);
18266
18267 sub Test_X($) {
18268     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
18269     # Each such line is a sequence of code points given by their hex numbers,
18270     # separated by the two characters defined just before this subroutine that
18271     # indicate that either there can or cannot be a break between the adjacent
18272     # code points.  If there isn't a break, that means the sequence forms an
18273     # extended grapheme cluster, which means that \X should match the whole
18274     # thing.  If there is a break, \X should stop there.  This is all
18275     # converted by this routine into a match:
18276     #   $string =~ /(\X)/,
18277     # Each \X should match the next cluster; and that is what is checked.
18278
18279     my $template = shift;
18280
18281     my $line   = (caller)[2];
18282
18283     # The line contains characters above the ASCII range, but in Latin1.  It
18284     # may or may not be in utf8, and if it is, it may or may not know it.  So,
18285     # convert these characters to 8 bits.  If knows is in utf8, simply
18286     # downgrade.
18287     if (utf8::is_utf8($template)) {
18288         utf8::downgrade($template);
18289     } else {
18290
18291         # Otherwise, if it is in utf8, but doesn't know it, the next lines
18292         # convert the two problematic characters to their 8-bit equivalents.
18293         # If it isn't in utf8, they don't harm anything.
18294         use bytes;
18295         $template =~ s/$nobreak_utf8/$nobreak/g;
18296         $template =~ s/$breakable_utf8/$breakable/g;
18297     }
18298
18299     # Get rid of the leading and trailing breakables
18300     $template =~ s/^ \s* $breakable \s* //x;
18301     $template =~ s/ \s* $breakable \s* $ //x;
18302
18303     # And no-breaks become just a space.
18304     $template =~ s/ \s* $nobreak \s* / /xg;
18305
18306     # Split the input into segments that are breakable between them.
18307     my @segments = split /\s*$breakable\s*/, $template;
18308
18309     my $string = "";
18310     my $display_string = "";
18311     my @should_match;
18312     my @should_display;
18313
18314     # Convert the code point sequence in each segment into a Perl string of
18315     # characters
18316     foreach my $segment (@segments) {
18317         my @code_points = split /\s+/, $segment;
18318         my $this_string = "";
18319         my $this_display = "";
18320         foreach my $code_point (@code_points) {
18321             $this_string .= latin1_to_native(chr(hex $code_point));
18322             $this_display .= "\\x{$code_point}";
18323         }
18324
18325         # The next cluster should match the string in this segment.
18326         push @should_match, $this_string;
18327         push @should_display, $this_display;
18328         $string .= $this_string;
18329         $display_string .= $this_display;
18330     }
18331
18332     # If a string can be represented in both non-ut8 and utf8, test both cases
18333     UPGRADE:
18334     for my $to_upgrade (0 .. 1) {
18335
18336         if ($to_upgrade) {
18337
18338             # If already in utf8, would just be a repeat
18339             next UPGRADE if utf8::is_utf8($string);
18340
18341             utf8::upgrade($string);
18342         }
18343
18344         # Finally, do the \X match.
18345         my @matches = $string =~ /(\X)/g;
18346
18347         # Look through each matched cluster to verify that it matches what we
18348         # expect.
18349         my $min = (@matches < @should_match) ? @matches : @should_match;
18350         for my $i (0 .. $min - 1) {
18351             $Tests++;
18352             if ($matches[$i] eq $should_match[$i]) {
18353                 print "ok $Tests - ";
18354                 if ($i == 0) {
18355                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18356                 } else {
18357                     print "And \\X #", $i + 1,
18358                 }
18359                 print " correctly matched $should_display[$i]; line $line\n";
18360             } else {
18361                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18362                                                     unpack("U*", $matches[$i]));
18363                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18364                     $i + 1,
18365                     " should have matched $should_display[$i]",
18366                     " but instead matched $matches[$i]",
18367                     ".  Abandoning rest of line $line\n";
18368                 next UPGRADE;
18369             }
18370         }
18371
18372         # And the number of matches should equal the number of expected matches.
18373         $Tests++;
18374         if (@matches == @should_match) {
18375             print "ok $Tests - Nothing was left over; line $line\n";
18376         } else {
18377             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18378         }
18379     }
18380
18381     return;
18382 }
18383
18384 sub Finished() {
18385     print "1..$Tests\n";
18386     exit($Fails ? -1 : 0);
18387 }
18388
18389 Error('\p{Script=InGreek}');    # Bug #69018
18390 Test_X("1100 $nobreak 1161");  # Bug #70940
18391 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18392 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18393 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726