lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25
  26 require 5.010_001;
  27 use strict;
  28 use warnings;
  29 use Carp;
  30 use Config;
  31 use File::Find;
  32 use File::Path;
  33 use File::Spec;
  34 use Text::Tabs;
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 ##########################################################################
  40 #
  41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  42 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  43 # a pod file and a .t file
  44 #
  45 # The structure of this file is:
  46 #   First these introductory comments; then
  47 #   code needed for everywhere, such as debugging stuff; then
  48 #   code to handle input parameters; then
  49 #   data structures likely to be of external interest (some of which depend on
  50 #       the input parameters, so follows them; then
  51 #   more data structures and subroutine and package (class) definitions; then
  52 #   the small actual loop to process the input files and finish up; then
  53 #   a __DATA__ section, for the .t tests
  54 #
  55 # This program works on all releases of Unicode through at least 6.0.  The
  56 # outputs have been scrutinized most intently for release 5.1.  The others
  57 # have been checked for somewhat more than just sanity.  It can handle all
  58 # existing Unicode character properties in those releases.
  59 #
  60 # This program is mostly about Unicode character (or code point) properties.
  61 # A property describes some attribute or quality of a code point, like if it
  62 # is lowercase or not, its name, what version of Unicode it was first defined
  63 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  64 # possibilities by making all properties into mappings from each code point
  65 # into some corresponding value.  In the case of it being lowercase or not,
  66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  67 # property maps each Unicode code point to a single value, called a "property
  68 # value".  (Hence each Unicode property is a true mathematical function with
  69 # exactly one value per code point.)
  70 #
  71 # When using a property in a regular expression, what is desired isn't the
  72 # mapping of the code point to its property's value, but the reverse (or the
  73 # mathematical "inverse relation"): starting with the property value, "Does a
  74 # code point map to it?"  These are written in a "compound" form:
  75 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  76 # files containing the lists of code points that map to each such regular
  77 # expression property value, one file per list
  78 #
  79 # There is also a single form shortcut that Perl adds for many of the commonly
  80 # used properties.  This happens for all binary properties, plus script,
  81 # general_category, and block properties.
  82 #
  83 # Thus the outputs of this program are files.  There are map files, mostly in
  84 # the 'To' directory; and there are list files for use in regular expression
  85 # matching, all in subdirectories of the 'lib' directory, with each
  86 # subdirectory being named for the property that the lists in it are for.
  87 # Bookkeeping, test, and documentation files are also generated.
  88
  89 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  90 my $map_directory = 'To';        # Where map files go.
  91
  92 # DATA STRUCTURES
  93 #
  94 # The major data structures of this program are Property, of course, but also
  95 # Table.  There are two kinds of tables, very similar to each other.
  96 # "Match_Table" is the data structure giving the list of code points that have
  97 # a particular property value, mentioned above.  There is also a "Map_Table"
  98 # data structure which gives the property's mapping from code point to value.
  99 # There are two structures because the match tables need to be combined in
 100 # various ways, such as constructing unions, intersections, complements, etc.,
 101 # and the map ones don't.  And there would be problems, perhaps subtle, if
 102 # a map table were inadvertently operated on in some of those ways.
 103 # The use of separate classes with operations defined on one but not the other
 104 # prevents accidentally confusing the two.
 105 #
 106 # At the heart of each table's data structure is a "Range_List", which is just
 107 # an ordered list of "Ranges", plus ancillary information, and methods to
 108 # operate on them.  A Range is a compact way to store property information.
 109 # Each range has a starting code point, an ending code point, and a value that
 110 # is meant to apply to all the code points between the two end points,
 111 # inclusive.  For a map table, this value is the property value for those
 112 # code points.  Two such ranges could be written like this:
 113 #   0x41 .. 0x5A, 'Upper',
 114 #   0x61 .. 0x7A, 'Lower'
 115 #
 116 # Each range also has a type used as a convenience to classify the values.
 117 # Most ranges in this program will be Type 0, or normal, but there are some
 118 # ranges that have a non-zero type.  These are used only in map tables, and
 119 # are for mappings that don't fit into the normal scheme of things.  Mappings
 120 # that require a hash entry to communicate with utf8.c are one example;
 121 # another example is mappings for charnames.pm to use which indicate a name
 122 # that is algorithmically determinable from its code point (and vice-versa).
 123 # These are used to significantly compact these tables, instead of listing
 124 # each one of the tens of thousands individually.
 125 #
 126 # In a match table, the value of a range is irrelevant (and hence the type as
 127 # well, which will always be 0), and arbitrarily set to the null string.
 128 # Using the example above, there would be two match tables for those two
 129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 130 # named Lower would contain 0x61..0x7A.
 131 #
 132 # Actually, there are two types of range lists, "Range_Map" is the one
 133 # associated with map tables, and "Range_List" with match tables.
 134 # Again, this is so that methods can be defined on one and not the other so as
 135 # to prevent operating on them in incorrect ways.
 136 #
 137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 138 # in the perl core.  All tables could in theory be written, but some are
 139 # suppressed because there is no current practical use for them.  It is easy
 140 # to change which get written by changing various lists that are near the top
 141 # of the actual code in this file.  The table data structures contain enough
 142 # ancillary information to allow them to be treated as separate entities for
 143 # writing, such as the path to each one's file.  There is a heading in each
 144 # map table that gives the format of its entries, and what the map is for all
 145 # the code points missing from it.  (This allows tables to be more compact.)
 146 #
 147 # The Property data structure contains one or more tables.  All properties
 148 # contain a map table (except the $perl property which is a
 149 # pseudo-property containing only match tables), and any properties that
 150 # are usable in regular expression matches also contain various matching
 151 # tables, one for each value the property can have.  A binary property can
 152 # have two values, True and False (or Y and N, which are preferred by Unicode
 153 # terminology).  Thus each of these properties will have a map table that
 154 # takes every code point and maps it to Y or N (but having ranges cuts the
 155 # number of entries in that table way down), and two match tables, one
 156 # which has a list of all the code points that map to Y, and one for all the
 157 # code points that map to N.  (For each of these, a third table is also
 158 # generated for the pseudo Perl property.  It contains the identical code
 159 # points as the Y table, but can be written, not in the compound form, but in
 160 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 161 # properties have several possible values, some have many, and properties like
 162 # Name have a different value for every named code point.  Those will not,
 163 # unless the controlling lists are changed, have their match tables written
 164 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 165 # constructs will.  Generally a property will have either its map table or its
 166 # match tables written but not both.  Again, what gets written is controlled
 167 # by lists which can easily be changed.  Properties have a 'Type', like
 168 # binary, or string, or enum depending on how many match tables there are and
 169 # the content of the maps.  This 'Type' is different than a range 'Type', so
 170 # don't get confused by the two concepts having the same name.
 171 #
 172 # For information about the Unicode properties, see Unicode's UAX44 document:
 173
 174 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 175
 176 # As stated earlier, this program will work on any release of Unicode so far.
 177 # Most obvious problems in earlier data have NOT been corrected except when
 178 # necessary to make Perl or this program work reasonably.  For example, no
 179 # folding information was given in early releases, so this program uses the
 180 # substitute of lower case, just so that a regular expression with the /i
 181 # option will do something that actually gives the right results in many
 182 # cases.  There are also a couple other corrections for version 1.1.5,
 183 # commented at the point they are made.  As an example of corrections that
 184 # weren't made (but could be) is this statement from DerivedAge.txt: "The
 185 # supplementary private use code points and the non-character code points were
 186 # assigned in version 2.0, but not specifically listed in the UCD until
 187 # versions 3.0 and 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0)
 188 # More information on Unicode version glitches is further down in these
 189 # introductory comments.
 190 #
 191 # This program works on all non-provisional properties as of 6.0, though the
 192 # files for some are suppressed from apparent lack of demand for them.  You
 193 # can change which are output by changing lists in this program.
 194 #
 195 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 196 # loose matchings rules (from Unicode TR18):
 197 #
 198 #    The recommended names for UCD properties and property values are in
 199 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 200 #    [PropValue]. There are both abbreviated names and longer, more
 201 #    descriptive names. It is strongly recommended that both names be
 202 #    recognized, and that loose matching of property names be used,
 203 #    whereby the case distinctions, whitespace, hyphens, and underbar
 204 #    are ignored.
 205 # The program still allows Fuzzy to override its determination of if loose
 206 # matching should be used, but it isn't currently used, as it is no longer
 207 # needed; the calculations it makes are good enough.
 208 #
 209 # SUMMARY OF HOW IT WORKS:
 210 #
 211 #   Process arguments
 212 #
 213 #   A list is constructed containing each input file that is to be processed
 214 #
 215 #   Each file on the list is processed in a loop, using the associated handler
 216 #   code for each:
 217 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 218 #            first.  These files name the properties and property values.
 219 #            Objects are created of all the property and property value names
 220 #            that the rest of the input should expect, including all synonyms.
 221 #        The other input files give mappings from properties to property
 222 #           values.  That is, they list code points and say what the mapping
 223 #           is under the given property.  Some files give the mappings for
 224 #           just one property; and some for many.  This program goes through
 225 #           each file and populates the properties from them.  Some properties
 226 #           are listed in more than one file, and Unicode has set up a
 227 #           precedence as to which has priority if there is a conflict.  Thus
 228 #           the order of processing matters, and this program handles the
 229 #           conflict possibility by processing the overriding input files
 230 #           last, so that if necessary they replace earlier values.
 231 #        After this is all done, the program creates the property mappings not
 232 #            furnished by Unicode, but derivable from what it does give.
 233 #        The tables of code points that match each property value in each
 234 #            property that is accessible by regular expressions are created.
 235 #        The Perl-defined properties are created and populated.  Many of these
 236 #            require data determined from the earlier steps
 237 #        Any Perl-defined synonyms are created, and name clashes between Perl
 238 #            and Unicode are reconciled and warned about.
 239 #        All the properties are written to files
 240 #        Any other files are written, and final warnings issued.
 241 #
 242 # For clarity, a number of operators have been overloaded to work on tables:
 243 #   ~ means invert (take all characters not in the set).  The more
 244 #       conventional '!' is not used because of the possibility of confusing
 245 #       it with the actual boolean operation.
 246 #   + means union
 247 #   - means subtraction
 248 #   & means intersection
 249 # The precedence of these is the order listed.  Parentheses should be
 250 # copiously used.  These are not a general scheme.  The operations aren't
 251 # defined for a number of things, deliberately, to avoid getting into trouble.
 252 # Operations are done on references and affect the underlying structures, so
 253 # that the copy constructors for them have been overloaded to not return a new
 254 # clone, but the input object itself.
 255 #
 256 # The bool operator is deliberately not overloaded to avoid confusion with
 257 # "should it mean if the object merely exists, or also is non-empty?".
 258 #
 259 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 260 #
 261 # This program needs to be able to run under miniperl.  Therefore, it uses a
 262 # minimum of other modules, and hence implements some things itself that could
 263 # be gotten from CPAN
 264 #
 265 # This program uses inputs published by the Unicode Consortium.  These can
 266 # change incompatibly between releases without the Perl maintainers realizing
 267 # it.  Therefore this program is now designed to try to flag these.  It looks
 268 # at the directories where the inputs are, and flags any unrecognized files.
 269 # It keeps track of all the properties in the files it handles, and flags any
 270 # that it doesn't know how to handle.  It also flags any input lines that
 271 # don't match the expected syntax, among other checks.
 272 #
 273 # It is also designed so if a new input file matches one of the known
 274 # templates, one hopefully just needs to add it to a list to have it
 275 # processed.
 276 #
 277 # As mentioned earlier, some properties are given in more than one file.  In
 278 # particular, the files in the extracted directory are supposedly just
 279 # reformattings of the others.  But they contain information not easily
 280 # derivable from the other files, including results for Unihan, which this
 281 # program doesn't ordinarily look at, and for unassigned code points.  They
 282 # also have historically had errors or been incomplete.  In an attempt to
 283 # create the best possible data, this program thus processes them first to
 284 # glean information missing from the other files; then processes those other
 285 # files to override any errors in the extracted ones.  Much of the design was
 286 # driven by this need to store things and then possibly override them.
 287 #
 288 # It tries to keep fatal errors to a minimum, to generate something usable for
 289 # testing purposes.  It always looks for files that could be inputs, and will
 290 # warn about any that it doesn't know how to handle (the -q option suppresses
 291 # the warning).
 292 #
 293 # Why have files written out for binary 'N' matches?
 294 #   For binary properties, if you know the mapping for either Y or N; the
 295 #   other is trivial to construct, so could be done at Perl run-time by just
 296 #   complementing the result, instead of having a file for it.  That is, if
 297 #   someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
 298 #   not need a file.   The problem is communicating to Perl that a given
 299 #   property is binary.  Perl can't figure it out from looking at the N (or
 300 #   No), as some non-binary properties have these as property values.  So
 301 #   rather than inventing a way to communicate this info back to the core,
 302 #   which would have required changes there as well, it was simpler just to
 303 #   add the extra tables.
 304 #
 305 # Why is there more than one type of range?
 306 #   This simplified things.  There are some very specialized code points that
 307 #   have to be handled specially for output, such as Hangul syllable names.
 308 #   By creating a range type (done late in the development process), it
 309 #   allowed this to be stored with the range, and overridden by other input.
 310 #   Originally these were stored in another data structure, and it became a
 311 #   mess trying to decide if a second file that was for the same property was
 312 #   overriding the earlier one or not.
 313 #
 314 # Why are there two kinds of tables, match and map?
 315 #   (And there is a base class shared by the two as well.)  As stated above,
 316 #   they actually are for different things.  Development proceeded much more
 317 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 318 #   give the property value for every code point (actually every code point
 319 #   that doesn't map to a default value).  Match tables are used for regular
 320 #   expression matches, and are essentially the inverse mapping.  Separating
 321 #   the two allows more specialized methods, and error checks so that one
 322 #   can't just take the intersection of two map tables, for example, as that
 323 #   is nonsensical.
 324 #
 325 # There are no match tables generated for matches of the null string.  These
 326 # would look like qr/\p{JSN=}/ currently without modifying the regex code.
 327 # Perhaps something like them could be added if necessary.  The JSN does have
 328 # a real code point U+110B that maps to the null string, but it is a
 329 # contributory property, and therefore not output by default.  And it's easily
 330 # handled so far by making the null string the default where it is a
 331 # possibility.
 332 #
 333 # DEBUGGING
 334 #
 335 # This program is written so it will run under miniperl.  Occasionally changes
 336 # will cause an error where the backtrace doesn't work well under miniperl.
 337 # To diagnose the problem, you can instead run it under regular perl, if you
 338 # have one compiled.
 339 #
 340 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 341 # to return true.  Then a line like
 342 #
 343 # local $to_trace = 1 if main::DEBUG;
 344 #
 345 # can be added to enable tracing in its lexical scope or until you insert
 346 # another line:
 347 #
 348 # local $to_trace = 0 if main::DEBUG;
 349 #
 350 # then use a line like "trace $a, @b, %c, ...;
 351 #
 352 # Some of the more complex subroutines already have trace statements in them.
 353 # Permanent trace statements should be like:
 354 #
 355 # trace ... if main::DEBUG && $to_trace;
 356 #
 357 # If there is just one or a few files that you're debugging, you can easily
 358 # cause most everything else to be skipped.  Change the line
 359 #
 360 # my $debug_skip = 0;
 361 #
 362 # to 1, and every file whose object is in @input_file_objects and doesn't have
 363 # a, 'non_skip => 1,' in its constructor will be skipped.
 364 #
 365 # To compare the output tables, it may be useful to specify the -annotate
 366 # flag.  This causes the tables to expand so there is one entry for each
 367 # non-algorithmically named code point giving, currently its name, and its
 368 # graphic representation if printable (and you have a font that knows about
 369 # it).  This makes it easier to see what the particular code points are in
 370 # each output table.  The tables are usable, but because they don't have
 371 # ranges (for the most part), a Perl using them will run slower.  Non-named
 372 # code points are annotated with a description of their status, and contiguous
 373 # ones with the same description will be output as a range rather than
 374 # individually.  Algorithmically named characters are also output as ranges,
 375 # except when there are just a few contiguous ones.
 376 #
 377 # FUTURE ISSUES
 378 #
 379 # The program would break if Unicode were to change its names so that
 380 # interior white space, underscores, or dashes differences were significant
 381 # within property and property value names.
 382 #
 383 # It might be easier to use the xml versions of the UCD if this program ever
 384 # would need heavy revision, and the ability to handle old versions was not
 385 # required.
 386 #
 387 # There is the potential for name collisions, in that Perl has chosen names
 388 # that Unicode could decide it also likes.  There have been such collisions in
 389 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 390 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 391 # collisions, which were withdrawn before the final release, because of Perl's
 392 # and other's protests.  These all involved new properties which began with
 393 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 394 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 395 # Unicode document, so they are unlikely to be used by Unicode for another
 396 # purpose.  However, they might try something beginning with 'In', or use any
 397 # of the other Perl-defined properties.  This program will warn you of name
 398 # collisions, and refuse to generate tables with them, but manual intervention
 399 # will be required in this event.  One scheme that could be implemented, if
 400 # necessary, would be to have this program generate another file, or add a
 401 # field to mktables.lst that gives the date of first definition of a property.
 402 # Each new release of Unicode would use that file as a basis for the next
 403 # iteration.  And the Perl synonym addition code could sort based on the age
 404 # of the property, so older properties get priority, and newer ones that clash
 405 # would be refused; hence existing code would not be impacted, and some other
 406 # synonym would have to be used for the new property.  This is ugly, and
 407 # manual intervention would certainly be easier to do in the short run; lets
 408 # hope it never comes to this.
 409 #
 410 # A NOTE ON UNIHAN
 411 #
 412 # This program can generate tables from the Unihan database.  But it doesn't
 413 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 414 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 415 # database was split into 8 different files, all beginning with the letters
 416 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 417 # know which of the many properties in the file(s) should have tables created
 418 # for them.  It will create tables for any properties listed in
 419 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 420 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 421 # property you want is not in those files of the release you are building
 422 # against, you must add it to those two arrays.  Starting in 4.0, the
 423 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 424 # is present in the directory, a table will be generated for that property.
 425 # In 5.2, several more properties were added.  For your convenience, the two
 426 # arrays are initialized with all the 6.0 listed properties that are also in
 427 # earlier releases.  But these are commented out.  You can just uncomment the
 428 # ones you want, or use them as a template for adding entries for other
 429 # properties.
 430 #
 431 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 432 # and filter_unihan_line() are the functions where this is done.  This program
 433 # already does some adjusting to make the lines look more like the rest of the
 434 # Unicode DB;  You can see what that is in filter_unihan_line()
 435 #
 436 # There is a bug in the 3.2 data file in which some values for the
 437 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 438 # could be added for these; or for a particular installation, the Unihan.txt
 439 # file could be edited to fix them.
 440 #
 441 # HOW TO ADD A FILE TO BE PROCESSED
 442 #
 443 # A new file from Unicode needs to have an object constructed for it in
 444 # @input_file_objects, probably at the end or at the end of the extracted
 445 # ones.  The program should warn you if its name will clash with others on
 446 # restrictive file systems, like DOS.  If so, figure out a better name, and
 447 # add lines to the README.perl file giving that.  If the file is a character
 448 # property, it should be in the format that Unicode has by default
 449 # standardized for such files for the more recently introduced ones.
 450 # If so, the Input_file constructor for @input_file_objects can just be the
 451 # file name and release it first appeared in.  If not, then it should be
 452 # possible to construct an each_line_handler() to massage the line into the
 453 # standardized form.
 454 #
 455 # For non-character properties, more code will be needed.  You can look at
 456 # the existing entries for clues.
 457 #
 458 # UNICODE VERSIONS NOTES
 459 #
 460 # The Unicode UCD has had a number of errors in it over the versions.  And
 461 # these remain, by policy, in the standard for that version.  Therefore it is
 462 # risky to correct them, because code may be expecting the error.  So this
 463 # program doesn't generally make changes, unless the error breaks the Perl
 464 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 465 # for U+1105, which causes real problems for the algorithms for Jamo
 466 # calculations, so it is changed here.
 467 #
 468 # But it isn't so clear cut as to what to do about concepts that are
 469 # introduced in a later release; should they extend back to earlier releases
 470 # where the concept just didn't exist?  It was easier to do this than to not,
 471 # so that's what was done.  For example, the default value for code points not
 472 # in the files for various properties was probably undefined until changed by
 473 # some version.  No_Block for blocks is such an example.  This program will
 474 # assign No_Block even in Unicode versions that didn't have it.  This has the
 475 # benefit that code being written doesn't have to special case earlier
 476 # versions; and the detriment that it doesn't match the Standard precisely for
 477 # the affected versions.
 478 #
 479 # Here are some observations about some of the issues in early versions:
 480 #
 481 # The number of code points in \p{alpha} halved in 2.1.9.  It turns out that
 482 # the reason is that the CJK block starting at 4E00 was removed from PropList,
 483 # and was not put back in until 3.1.0
 484 #
 485 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 486 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 487 # reason is that 3.2 introduced U+205F=medium math space, which was not
 488 # classed as white space, but Perl figured out that it should have been. 4.0
 489 # reclassified it correctly.
 490 #
 491 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 492 # this was erroneously a synonym for 202.  In 4.0, ATB became 202, and ATBL
 493 # was left with no code points, as all the ones that mapped to 202 stayed
 494 # mapped to 202.  Thus if your program used the numeric name for the class,
 495 # it would not have been affected, but if it used the mnemonic, it would have
 496 # been.
 497 #
 498 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 499 # points which eventually came to have this script property value, instead
 500 # mapped to "Unknown".  But in the next release all these code points were
 501 # moved to \p{sc=common} instead.
 502 #
 503 # The default for missing code points for BidiClass is complicated.  Starting
 504 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 505 # tries to do the best it can for earlier releases.  It is done in
 506 # process_PropertyAliases()
 507 #
 508 ##############################################################################
 509
 510 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 511                         # and errors
 512 my $MAX_LINE_WIDTH = 78;
 513
 514 # Debugging aid to skip most files so as to not be distracted by them when
 515 # concentrating on the ones being debugged.  Add
 516 # non_skip => 1,
 517 # to the constructor for those files you want processed when you set this.
 518 # Files with a first version number of 0 are special: they are always
 519 # processed regardless of the state of this flag.
 520 my $debug_skip = 0;
 521
 522 # Set to 1 to enable tracing.
 523 our $to_trace = 0;
 524
 525 { # Closure for trace: debugging aid
 526     my $print_caller = 1;        # ? Include calling subroutine name
 527     my $main_with_colon = 'main::';
 528     my $main_colon_length = length($main_with_colon);
 529
 530     sub trace {
 531         return unless $to_trace;        # Do nothing if global flag not set
 532
 533         my @input = @_;
 534
 535         local $DB::trace = 0;
 536         $DB::trace = 0;          # Quiet 'used only once' message
 537
 538         my $line_number;
 539
 540         # Loop looking up the stack to get the first non-trace caller
 541         my $caller_line;
 542         my $caller_name;
 543         my $i = 0;
 544         do {
 545             $line_number = $caller_line;
 546             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 547             $caller = $main_with_colon unless defined $caller;
 548
 549             $caller_name = $caller;
 550
 551             # get rid of pkg
 552             $caller_name =~ s/.*:://;
 553             if (substr($caller_name, 0, $main_colon_length)
 554                 eq $main_with_colon)
 555             {
 556                 $caller_name = substr($caller_name, $main_colon_length);
 557             }
 558
 559         } until ($caller_name ne 'trace');
 560
 561         # If the stack was empty, we were called from the top level
 562         $caller_name = 'main' if ($caller_name eq ""
 563                                     || $caller_name eq 'trace');
 564
 565         my $output = "";
 566         foreach my $string (@input) {
 567             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 568             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 569                 $output .= simple_dumper($string);
 570             }
 571             else {
 572                 $string = "$string" if ref $string;
 573                 $string = $UNDEF unless defined $string;
 574                 chomp $string;
 575                 $string = '""' if $string eq "";
 576                 $output .= " " if $output ne ""
 577                                 && $string ne ""
 578                                 && substr($output, -1, 1) ne " "
 579                                 && substr($string, 0, 1) ne " ";
 580                 $output .= $string;
 581             }
 582         }
 583
 584         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 585         print STDERR "$caller_name: " if $print_caller;
 586         print STDERR $output, "\n";
 587         return;
 588     }
 589 }
 590
 591 # This is for a rarely used development feature that allows you to compare two
 592 # versions of the Unicode standard without having to deal with changes caused
 593 # by the code points introduced in the later verson.  Change the 0 to a SINGLE
 594 # dotted Unicode release number (e.g. 2.1).  Only code points introduced in
 595 # that release and earlier will be used; later ones are thrown away.  You use
 596 # the version number of the earliest one you want to compare; then run this
 597 # program on directory structures containing each release, and compare the
 598 # outputs.  These outputs will therefore include only the code points common
 599 # to both releases, and you can see the changes caused just by the underlying
 600 # release semantic changes.  For versions earlier than 3.2, you must copy a
 601 # version of DAge.txt into the directory.
 602 my $string_compare_versions = DEBUG && 0; #  e.g., v2.1;
 603 my $compare_versions = DEBUG
 604                        && $string_compare_versions
 605                        && pack "C*", split /\./, $string_compare_versions;
 606
 607 sub uniques {
 608     # Returns non-duplicated input values.  From "Perl Best Practices:
 609     # Encapsulated Cleverness".  p. 455 in first edition.
 610
 611     my %seen;
 612     # Arguably this breaks encapsulation, if the goal is to permit multiple
 613     # distinct objects to stringify to the same value, and be interchangeable.
 614     # However, for this program, no two objects stringify identically, and all
 615     # lists passed to this function are either objects or strings. So this
 616     # doesn't affect correctness, but it does give a couple of percent speedup.
 617     no overloading;
 618     return grep { ! $seen{$_}++ } @_;
 619 }
 620
 621 $0 = File::Spec->canonpath($0);
 622
 623 my $make_test_script = 0;      # ? Should we output a test script
 624 my $write_unchanged_files = 0; # ? Should we update the output files even if
 625                                #    we don't think they have changed
 626 my $use_directory = "";        # ? Should we chdir somewhere.
 627 my $pod_directory;             # input directory to store the pod file.
 628 my $pod_file = 'perluniprops';
 629 my $t_path;                     # Path to the .t test file
 630 my $file_list = 'mktables.lst'; # File to store input and output file names.
 631                                # This is used to speed up the build, by not
 632                                # executing the main body of the program if
 633                                # nothing on the list has changed since the
 634                                # previous build
 635 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 636                                # make a list so that when the pumpking is
 637                                # preparing a release, s/he won't have to do
 638                                # special things
 639 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 640                                # in the input.
 641 my $output_range_counts = $debugging_build;   # ? Should we include the number
 642                                               # of code points in ranges in
 643                                               # the output
 644 my $annotate = 0;              # ? Should character names be in the output
 645
 646 # Verbosity levels; 0 is quiet
 647 my $NORMAL_VERBOSITY = 1;
 648 my $PROGRESS = 2;
 649 my $VERBOSE = 3;
 650
 651 my $verbosity = $NORMAL_VERBOSITY;
 652
 653 # Process arguments
 654 while (@ARGV) {
 655     my $arg = shift @ARGV;
 656     if ($arg eq '-v') {
 657         $verbosity = $VERBOSE;
 658     }
 659     elsif ($arg eq '-p') {
 660         $verbosity = $PROGRESS;
 661         $| = 1;     # Flush buffers as we go.
 662     }
 663     elsif ($arg eq '-q') {
 664         $verbosity = 0;
 665     }
 666     elsif ($arg eq '-w') {
 667         $write_unchanged_files = 1; # update the files even if havent changed
 668     }
 669     elsif ($arg eq '-check') {
 670         my $this = shift @ARGV;
 671         my $ok = shift @ARGV;
 672         if ($this ne $ok) {
 673             print "Skipping as check params are not the same.\n";
 674             exit(0);
 675         }
 676     }
 677     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 678         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 679     }
 680     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 681     {
 682         $make_test_script = 1;
 683     }
 684     elsif ($arg eq '-makelist') {
 685         $make_list = 1;
 686     }
 687     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 688         -d $use_directory or croak "Unknown directory '$use_directory'";
 689     }
 690     elsif ($arg eq '-L') {
 691
 692         # Existence not tested until have chdir'd
 693         $file_list = shift;
 694     }
 695     elsif ($arg eq '-globlist') {
 696         $glob_list = 1;
 697     }
 698     elsif ($arg eq '-c') {
 699         $output_range_counts = ! $output_range_counts
 700     }
 701     elsif ($arg eq '-annotate') {
 702         $annotate = 1;
 703         $debugging_build = 1;
 704         $output_range_counts = 1;
 705     }
 706     else {
 707         my $with_c = 'with';
 708         $with_c .= 'out' if $output_range_counts;   # Complements the state
 709         croak <<END;
 710 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 711           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 712           [-check A B ]
 713   -c          : Output comments $with_c number of code points in ranges
 714   -q          : Quiet Mode: Only output serious warnings.
 715   -p          : Set verbosity level to normal plus show progress.
 716   -v          : Set Verbosity level high:  Show progress and non-serious
 717                 warnings
 718   -w          : Write files regardless
 719   -C dir      : Change to this directory before proceeding. All relative paths
 720                 except those specified by the -P and -T options will be done
 721                 with respect to this directory.
 722   -P dir      : Output $pod_file file to directory 'dir'.
 723   -T path     : Create a test script as 'path'; overrides -maketest
 724   -L filelist : Use alternate 'filelist' instead of standard one
 725   -globlist   : Take as input all non-Test *.txt files in current and sub
 726                 directories
 727   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 728                 overrides -T
 729   -makelist   : Rewrite the file list $file_list based on current setup
 730   -annotate   : Output an annotation for each character in the table files;
 731                 useful for debugging mktables, looking at diffs; but is slow,
 732                 memory intensive; resulting tables are usable but slow and
 733                 very large.
 734   -check A B  : Executes $0 only if A and B are the same
 735 END
 736     }
 737 }
 738
 739 # Stores the most-recently changed file.  If none have changed, can skip the
 740 # build
 741 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 742
 743 # Change directories now, because need to read 'version' early.
 744 if ($use_directory) {
 745     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 746         $pod_directory = File::Spec->rel2abs($pod_directory);
 747     }
 748     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 749         $t_path = File::Spec->rel2abs($t_path);
 750     }
 751     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 752     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 753         $pod_directory = File::Spec->abs2rel($pod_directory);
 754     }
 755     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 756         $t_path = File::Spec->abs2rel($t_path);
 757     }
 758 }
 759
 760 # Get Unicode version into regular and v-string.  This is done now because
 761 # various tables below get populated based on it.  These tables are populated
 762 # here to be near the top of the file, and so easily seeable by those needing
 763 # to modify things.
 764 open my $VERSION, "<", "version"
 765                     or croak "$0: can't open required file 'version': $!\n";
 766 my $string_version = <$VERSION>;
 767 close $VERSION;
 768 chomp $string_version;
 769 my $v_version = pack "C*", split /\./, $string_version;        # v string
 770
 771 # The following are the complete names of properties with property values that
 772 # are known to not match any code points in some versions of Unicode, but that
 773 # may change in the future so they should be matchable, hence an empty file is
 774 # generated for them.
 775 my @tables_that_may_be_empty = (
 776                                 'Joining_Type=Left_Joining',
 777                                 );
 778 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 779 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 780 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 781                                                     if $v_version ge v4.1.0;
 782
 783 # The lists below are hashes, so the key is the item in the list, and the
 784 # value is the reason why it is in the list.  This makes generation of
 785 # documentation easier.
 786
 787 my %why_suppressed;  # No file generated for these.
 788
 789 # Files aren't generated for empty extraneous properties.  This is arguable.
 790 # Extraneous properties generally come about because a property is no longer
 791 # used in a newer version of Unicode.  If we generated a file without code
 792 # points, programs that used to work on that property will still execute
 793 # without errors.  It just won't ever match (or will always match, with \P{}).
 794 # This means that the logic is now likely wrong.  I (khw) think its better to
 795 # find this out by getting an error message.  Just move them to the table
 796 # above to change this behavior
 797 my %why_suppress_if_empty_warn_if_not = (
 798
 799    # It is the only property that has ever officially been removed from the
 800    # Standard.  The database never contained any code points for it.
 801    'Special_Case_Condition' => 'Obsolete',
 802
 803    # Apparently never official, but there were code points in some versions of
 804    # old-style PropList.txt
 805    'Non_Break' => 'Obsolete',
 806 );
 807
 808 # These would normally go in the warn table just above, but they were changed
 809 # a long time before this program was written, so warnings about them are
 810 # moot.
 811 if ($v_version gt v3.2.0) {
 812     push @tables_that_may_be_empty,
 813                                 'Canonical_Combining_Class=Attached_Below_Left'
 814 }
 815
 816 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 817 # unless explicitly added.
 818 if ($v_version ge v5.2.0) {
 819     my $unihan = 'Unihan; remove from list if using Unihan';
 820     foreach my $table (qw (
 821                            kAccountingNumeric
 822                            kOtherNumeric
 823                            kPrimaryNumeric
 824                            kCompatibilityVariant
 825                            kIICore
 826                            kIRG_GSource
 827                            kIRG_HSource
 828                            kIRG_JSource
 829                            kIRG_KPSource
 830                            kIRG_MSource
 831                            kIRG_KSource
 832                            kIRG_TSource
 833                            kIRG_USource
 834                            kIRG_VSource
 835                            kRSUnicode
 836                         ))
 837     {
 838         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 839     }
 840 }
 841
 842 # Properties that this program ignores.
 843 my @unimplemented_properties = (
 844 'Unicode_Radical_Stroke'    # Remove if changing to handle this one.
 845 );
 846
 847 # There are several types of obsolete properties defined by Unicode.  These
 848 # must be hand-edited for every new Unicode release.
 849 my %why_deprecated;  # Generates a deprecated warning message if used.
 850 my %why_stabilized;  # Documentation only
 851 my %why_obsolete;    # Documentation only
 852
 853 {   # Closure
 854     my $simple = 'Perl uses the more complete version of this property';
 855     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 856
 857     my $other_properties = 'other properties';
 858     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 859     my $why_no_expand  = "Deprecated by Unicode: less useful than UTF-specific calculations",
 860
 861     %why_deprecated = (
 862         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 863         'Jamo_Short_Name' => $contributory,
 864         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 865         'Other_Alphabetic' => $contributory,
 866         'Other_Default_Ignorable_Code_Point' => $contributory,
 867         'Other_Grapheme_Extend' => $contributory,
 868         'Other_ID_Continue' => $contributory,
 869         'Other_ID_Start' => $contributory,
 870         'Other_Lowercase' => $contributory,
 871         'Other_Math' => $contributory,
 872         'Other_Uppercase' => $contributory,
 873     );
 874
 875     %why_suppressed = (
 876         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 877         # contains the same information, but without the algorithmically
 878         # determinable Hangul syllables'.  This file is not published, so it's
 879         # existence is not noted in the comment.
 880         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
 881
 882         'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo.  Obsoleted, and code points for it removed in Unicode 5.2',
 883         'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo.  If there is no later name for a code point, then this one is used instead in charnames",
 884
 885         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold",
 886         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 887         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 888         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 889
 890         'Name' => "Accessible via 'use charnames;'",
 891         'Name_Alias' => "Accessible via 'use charnames;'",
 892
 893         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 894         Expands_On_NFC => $why_no_expand,
 895         Expands_On_NFD => $why_no_expand,
 896         Expands_On_NFKC => $why_no_expand,
 897         Expands_On_NFKD => $why_no_expand,
 898     );
 899
 900     # The following are suppressed because they were made contributory or
 901     # deprecated by Unicode before Perl ever thought about supporting them.
 902     foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
 903         $why_suppressed{$property} = $why_deprecated{$property};
 904     }
 905
 906     # Customize the message for all the 'Other_' properties
 907     foreach my $property (keys %why_deprecated) {
 908         next if (my $main_property = $property) !~ s/^Other_//;
 909         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 910     }
 911 }
 912
 913 if ($v_version ge 4.0.0) {
 914     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 915     if ($v_version ge 6.0.0) {
 916         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
 917     }
 918 }
 919 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
 920     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 921     if ($v_version ge 6.0.0) {
 922         $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
 923     }
 924 }
 925
 926 # Probably obsolete forever
 927 if ($v_version ge v4.1.0) {
 928     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common"';
 929 }
 930
 931 # This program can create files for enumerated-like properties, such as
 932 # 'Numeric_Type'.  This file would be the same format as for a string
 933 # property, with a mapping from code point to its value, so you could look up,
 934 # for example, the script a code point is in.  But no one so far wants this
 935 # mapping, or they have found another way to get it since this is a new
 936 # feature.  So no file is generated except if it is in this list.
 937 my @output_mapped_properties = split "\n", <<END;
 938 END
 939
 940 # If you are using the Unihan database, you need to add the properties that
 941 # you want to extract from it to this table.  For your convenience, the
 942 # properties in the 6.0 PropertyAliases.txt file are listed, commented out
 943 my @cjk_properties = split "\n", <<'END';
 944 #cjkAccountingNumeric; kAccountingNumeric
 945 #cjkOtherNumeric; kOtherNumeric
 946 #cjkPrimaryNumeric; kPrimaryNumeric
 947 #cjkCompatibilityVariant; kCompatibilityVariant
 948 #cjkIICore ; kIICore
 949 #cjkIRG_GSource; kIRG_GSource
 950 #cjkIRG_HSource; kIRG_HSource
 951 #cjkIRG_JSource; kIRG_JSource
 952 #cjkIRG_KPSource; kIRG_KPSource
 953 #cjkIRG_KSource; kIRG_KSource
 954 #cjkIRG_TSource; kIRG_TSource
 955 #cjkIRG_USource; kIRG_USource
 956 #cjkIRG_VSource; kIRG_VSource
 957 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
 958 END
 959
 960 # Similarly for the property values.  For your convenience, the lines in the
 961 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
 962 # '#' marks
 963 my @cjk_property_values = split "\n", <<'END';
 964 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
 965 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
 966 ## @missing: 0000..10FFFF; cjkIICore; <none>
 967 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
 968 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
 969 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
 970 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
 971 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
 972 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
 973 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
 974 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
 975 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
 976 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
 977 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
 978 END
 979
 980 # The input files don't list every code point.  Those not listed are to be
 981 # defaulted to some value.  Below are hard-coded what those values are for
 982 # non-binary properties as of 5.1.  Starting in 5.0, there are
 983 # machine-parsable comment lines in the files the give the defaults; so this
 984 # list shouldn't have to be extended.  The claim is that all missing entries
 985 # for binary properties will default to 'N'.  Unicode tried to change that in
 986 # 5.2, but the beta period produced enough protest that they backed off.
 987 #
 988 # The defaults for the fields that appear in UnicodeData.txt in this hash must
 989 # be in the form that it expects.  The others may be synonyms.
 990 my $CODE_POINT = '<code point>';
 991 my %default_mapping = (
 992     Age => "Unassigned",
 993     # Bidi_Class => Complicated; set in code
 994     Bidi_Mirroring_Glyph => "",
 995     Block => 'No_Block',
 996     Canonical_Combining_Class => 0,
 997     Case_Folding => $CODE_POINT,
 998     Decomposition_Mapping => $CODE_POINT,
 999     Decomposition_Type => 'None',
1000     East_Asian_Width => "Neutral",
1001     FC_NFKC_Closure => $CODE_POINT,
1002     General_Category => 'Cn',
1003     Grapheme_Cluster_Break => 'Other',
1004     Hangul_Syllable_Type => 'NA',
1005     ISO_Comment => "",
1006     Jamo_Short_Name => "",
1007     Joining_Group => "No_Joining_Group",
1008     # Joining_Type => Complicated; set in code
1009     kIICore => 'N',   #                       Is converted to binary
1010     #Line_Break => Complicated; set in code
1011     Lowercase_Mapping => $CODE_POINT,
1012     Name => "",
1013     Name_Alias => "",
1014     NFC_QC => 'Yes',
1015     NFD_QC => 'Yes',
1016     NFKC_QC => 'Yes',
1017     NFKD_QC => 'Yes',
1018     Numeric_Type => 'None',
1019     Numeric_Value => 'NaN',
1020     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1021     Sentence_Break => 'Other',
1022     Simple_Case_Folding => $CODE_POINT,
1023     Simple_Lowercase_Mapping => $CODE_POINT,
1024     Simple_Titlecase_Mapping => $CODE_POINT,
1025     Simple_Uppercase_Mapping => $CODE_POINT,
1026     Titlecase_Mapping => $CODE_POINT,
1027     Unicode_1_Name => "",
1028     Unicode_Radical_Stroke => "",
1029     Uppercase_Mapping => $CODE_POINT,
1030     Word_Break => 'Other',
1031 );
1032
1033 # Below are files that Unicode furnishes, but this program ignores, and why
1034 my %ignored_files = (
1035     'CJKRadicals.txt' => 'Unihan data',
1036     'Index.txt' => 'An index, not actual data',
1037     'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1038     'NamesList.txt' => 'Just adds commentary',
1039     'NormalizationCorrections.txt' => 'Data is already in other files.',
1040     'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1041     'ReadMe.txt' => 'Just comments',
1042     'README.TXT' => 'Just comments',
1043     'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property.  Does not fit into current scheme where one code point is mapped',
1044     'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
1045     'IndicMatraCategory.txt' => 'Provisional',
1046     'IndicSyllabicCategory.txt' => 'Provisional',
1047     'ScriptExtensions.txt' => 'Provisional',
1048 );
1049
1050 ### End of externally interesting definitions, except for @input_file_objects
1051
1052 my $HEADER=<<"EOF";
1053 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1054 # This file is machine-generated by $0 from the Unicode
1055 # database, Version $string_version.  Any changes made here will be lost!
1056 EOF
1057
1058 my $INTERNAL_ONLY=<<"EOF";
1059
1060 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1061 # This file is for internal use by the Perl program only.  The format and even
1062 # the name or existence of this file are subject to change without notice.
1063 # Don't use it directly.
1064 EOF
1065
1066 my $DEVELOPMENT_ONLY=<<"EOF";
1067 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1068 # This file contains information artificially constrained to code points
1069 # present in Unicode release $string_compare_versions.
1070 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1071 # not be used for production.
1072
1073 EOF
1074
1075 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1076 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1077 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1078
1079 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1080 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1081 # to decrease backtracking
1082 my $code_point_re =
1083         qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1084
1085 # This matches the beginning of the line in the Unicode db files that give the
1086 # defaults for code points not listed (i.e., missing) in the file.  The code
1087 # depends on this ending with a semi-colon, so it can assume it is a valid
1088 # field when the line is split() by semi-colons
1089 my $missing_defaults_prefix =
1090             qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1091
1092 # Property types.  Unicode has more types, but these are sufficient for our
1093 # purposes.
1094 my $UNKNOWN = -1;   # initialized to illegal value
1095 my $NON_STRING = 1; # Either binary or enum
1096 my $BINARY = 2;
1097 my $ENUM = 3;       # Include catalog
1098 my $STRING = 4;     # Anything else: string or misc
1099
1100 # Some input files have lines that give default values for code points not
1101 # contained in the file.  Sometimes these should be ignored.
1102 my $NO_DEFAULTS = 0;        # Must evaluate to false
1103 my $NOT_IGNORED = 1;
1104 my $IGNORED = 2;
1105
1106 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1107 # and will appear in the main body of the tables in the output files, but
1108 # there are other types of ranges as well, listed below, that are specially
1109 # handled.   There are pseudo-types as well that will never be stored as a
1110 # type, but will affect the calculation of the type.
1111
1112 # 0 is for normal, non-specials
1113 my $MULTI_CP = 1;           # Sequence of more than code point
1114 my $HANGUL_SYLLABLE = 2;
1115 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1116 my $NULL = 4;               # The map is to the null string; utf8.c can't
1117                             # handle these, nor is there an accepted syntax
1118                             # for them in \p{} constructs
1119 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1120                              # otherwise be $MULTI_CP type are instead type 0
1121
1122 # process_generic_property_file() can accept certain overrides in its input.
1123 # Each of these must begin AND end with $CMD_DELIM.
1124 my $CMD_DELIM = "\a";
1125 my $REPLACE_CMD = 'replace';    # Override the Replace
1126 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1127
1128 my $NO = 0;
1129 my $YES = 1;
1130
1131 # Values for the Replace argument to add_range.
1132 # $NO                      # Don't replace; add only the code points not
1133                            # already present.
1134 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1135                            # the comments at the subroutine definition.
1136 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1137 my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
1138                            # already there
1139 my $CROAK = 5;             # Die with an error if is already there
1140
1141 # Flags to give property statuses.  The phrases are to remind maintainers that
1142 # if the flag is changed, the indefinite article referring to it in the
1143 # documentation may need to be as well.
1144 my $NORMAL = "";
1145 my $SUPPRESSED = 'z';   # The character should never actually be seen, since
1146                         # it is suppressed
1147 my $PLACEHOLDER = 'P';  # Implies no pod entry generated
1148 my $DEPRECATED = 'D';
1149 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1150 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1151 my $DISCOURAGED = 'X';
1152 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1153 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1154 my $STRICTER = 'T';
1155 my $a_bold_stricter = "a 'B<$STRICTER>'";
1156 my $A_bold_stricter = "A 'B<$STRICTER>'";
1157 my $STABILIZED = 'S';
1158 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1159 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1160 my $OBSOLETE = 'O';
1161 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1162 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1163
1164 my %status_past_participles = (
1165     $DISCOURAGED => 'discouraged',
1166     $SUPPRESSED => 'should never be generated',
1167     $STABILIZED => 'stabilized',
1168     $OBSOLETE => 'obsolete',
1169     $DEPRECATED => 'deprecated',
1170 );
1171
1172 # The format of the values of the tables:
1173 my $EMPTY_FORMAT = "";
1174 my $BINARY_FORMAT = 'b';
1175 my $DECIMAL_FORMAT = 'd';
1176 my $FLOAT_FORMAT = 'f';
1177 my $INTEGER_FORMAT = 'i';
1178 my $HEX_FORMAT = 'x';
1179 my $RATIONAL_FORMAT = 'r';
1180 my $STRING_FORMAT = 's';
1181 my $DECOMP_STRING_FORMAT = 'c';
1182
1183 my %map_table_formats = (
1184     $BINARY_FORMAT => 'binary',
1185     $DECIMAL_FORMAT => 'single decimal digit',
1186     $FLOAT_FORMAT => 'floating point number',
1187     $INTEGER_FORMAT => 'integer',
1188     $HEX_FORMAT => 'positive hex whole number; a code point',
1189     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1190     $STRING_FORMAT => 'string',
1191     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1192 );
1193
1194 # Unicode didn't put such derived files in a separate directory at first.
1195 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1196 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1197 my $AUXILIARY = 'auxiliary';
1198
1199 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1200 my %loose_to_file_of;       # loosely maps table names to their respective
1201                             # files
1202 my %stricter_to_file_of;    # same; but for stricter mapping.
1203 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1204                              # their rational equivalent
1205 my %loose_property_name_of; # Loosely maps property names to standard form
1206
1207 # These constants names and values were taken from the Unicode standard,
1208 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1209 # syllables.  The '_string' versions are so generated tables can retain the
1210 # hex format, which is the more familiar value
1211 my $SBase_string = "0xAC00";
1212 my $SBase = CORE::hex $SBase_string;
1213 my $LBase_string = "0x1100";
1214 my $LBase = CORE::hex $LBase_string;
1215 my $VBase_string = "0x1161";
1216 my $VBase = CORE::hex $VBase_string;
1217 my $TBase_string = "0x11A7";
1218 my $TBase = CORE::hex $TBase_string;
1219 my $SCount = 11172;
1220 my $LCount = 19;
1221 my $VCount = 21;
1222 my $TCount = 28;
1223 my $NCount = $VCount * $TCount;
1224
1225 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1226 # with the above published constants.
1227 my %Jamo;
1228 my %Jamo_L;     # Leading consonants
1229 my %Jamo_V;     # Vowels
1230 my %Jamo_T;     # Trailing consonants
1231
1232 my @backslash_X_tests;     # List of tests read in for testing \X
1233 my @unhandled_properties;  # Will contain a list of properties found in
1234                            # the input that we didn't process.
1235 my @match_properties;      # Properties that have match tables, to be
1236                            # listed in the pod
1237 my @map_properties;        # Properties that get map files written
1238 my @named_sequences;       # NamedSequences.txt contents.
1239 my %potential_files;       # Generated list of all .txt files in the directory
1240                            # structure so we can warn if something is being
1241                            # ignored.
1242 my @files_actually_output; # List of files we generated.
1243 my @more_Names;            # Some code point names are compound; this is used
1244                            # to store the extra components of them.
1245 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1246                            # the minimum before we consider it equivalent to a
1247                            # candidate rational
1248 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1249
1250 # These store references to certain commonly used property objects
1251 my $gc;
1252 my $perl;
1253 my $block;
1254 my $perl_charname;
1255 my $print;
1256
1257 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1258 my $has_In_conflicts = 0;
1259 my $has_Is_conflicts = 0;
1260
1261 sub internal_file_to_platform ($) {
1262     # Convert our file paths which have '/' separators to those of the
1263     # platform.
1264
1265     my $file = shift;
1266     return undef unless defined $file;
1267
1268     return File::Spec->join(split '/', $file);
1269 }
1270
1271 sub file_exists ($) {   # platform independent '-e'.  This program internally
1272                         # uses slash as a path separator.
1273     my $file = shift;
1274     return 0 if ! defined $file;
1275     return -e internal_file_to_platform($file);
1276 }
1277
1278 sub objaddr($) {
1279     # Returns the address of the blessed input object.
1280     # It doesn't check for blessedness because that would do a string eval
1281     # every call, and the program is structured so that this is never called
1282     # for a non-blessed object.
1283
1284     no overloading; # If overloaded, numifying below won't work.
1285
1286     # Numifying a ref gives its address.
1287     return pack 'J', $_[0];
1288 }
1289
1290 # These are used only if $annotate is true.
1291 # The entire range of Unicode characters is examined to populate these
1292 # after all the input has been processed.  But most can be skipped, as they
1293 # have the same descriptive phrases, such as being unassigned
1294 my @viacode;            # Contains the 1 million character names
1295 my @printable;          # boolean: And are those characters printable?
1296 my @annotate_char_type; # Contains a type of those characters, specifically
1297                         # for the purposes of annotation.
1298 my $annotate_ranges;    # A map of ranges of code points that have the same
1299                         # name for the purposes of annotation.  They map to the
1300                         # upper edge of the range, so that the end point can
1301                         # be immediately found.  This is used to skip ahead to
1302                         # the end of a range, and avoid processing each
1303                         # individual code point in it.
1304 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1305                                    # characters, but excluding those which are
1306                                    # also noncharacter code points
1307
1308 # The annotation types are an extension of the regular range types, though
1309 # some of the latter are folded into one.  Make the new types negative to
1310 # avoid conflicting with the regular types
1311 my $SURROGATE_TYPE = -1;
1312 my $UNASSIGNED_TYPE = -2;
1313 my $PRIVATE_USE_TYPE = -3;
1314 my $NONCHARACTER_TYPE = -4;
1315 my $CONTROL_TYPE = -5;
1316 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1317
1318 sub populate_char_info ($) {
1319     # Used only with the $annotate option.  Populates the arrays with the
1320     # input code point's info that are needed for outputting more detailed
1321     # comments.  If calling context wants a return, it is the end point of
1322     # any contiguous range of characters that share essentially the same info
1323
1324     my $i = shift;
1325     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1326
1327     $viacode[$i] = $perl_charname->value_of($i) || "";
1328
1329     # A character is generally printable if Unicode says it is,
1330     # but below we make sure that most Unicode general category 'C' types
1331     # aren't.
1332     $printable[$i] = $print->contains($i);
1333
1334     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1335
1336     # Only these two regular types are treated specially for annotations
1337     # purposes
1338     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1339                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1340
1341     # Give a generic name to all code points that don't have a real name.
1342     # We output ranges, if applicable, for these.  Also calculate the end
1343     # point of the range.
1344     my $end;
1345     if (! $viacode[$i]) {
1346         if ($gc-> table('Surrogate')->contains($i)) {
1347             $viacode[$i] = 'Surrogate';
1348             $annotate_char_type[$i] = $SURROGATE_TYPE;
1349             $printable[$i] = 0;
1350             $end = $gc->table('Surrogate')->containing_range($i)->end;
1351         }
1352         elsif ($gc-> table('Private_use')->contains($i)) {
1353             $viacode[$i] = 'Private Use';
1354             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1355             $printable[$i] = 0;
1356             $end = $gc->table('Private_Use')->containing_range($i)->end;
1357         }
1358         elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1359                                                                 contains($i))
1360         {
1361             $viacode[$i] = 'Noncharacter';
1362             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1363             $printable[$i] = 0;
1364             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1365                                                     containing_range($i)->end;
1366         }
1367         elsif ($gc-> table('Control')->contains($i)) {
1368             $viacode[$i] = 'Control';
1369             $annotate_char_type[$i] = $CONTROL_TYPE;
1370             $printable[$i] = 0;
1371             $end = 0x81 if $i == 0x80;  # Hard-code this one known case
1372         }
1373         elsif ($gc-> table('Unassigned')->contains($i)) {
1374             $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1375             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1376             $printable[$i] = 0;
1377
1378             # Because we name the unassigned by the blocks they are in, it
1379             # can't go past the end of that block, and it also can't go past
1380             # the unassigned range it is in.  The special table makes sure
1381             # that the non-characters, which are unassigned, are separated
1382             # out.
1383             $end = min($block->containing_range($i)->end,
1384                        $unassigned_sans_noncharacters-> containing_range($i)->
1385                                                                          end);
1386         }
1387         else {
1388             Carp::my_carp_bug("Can't figure out how to annotate "
1389                               . sprintf("U+%04X", $i)
1390                               . ".  Proceeding anyway.");
1391             $viacode[$i] = 'UNKNOWN';
1392             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1393             $printable[$i] = 0;
1394         }
1395     }
1396
1397     # Here, has a name, but if it's one in which the code point number is
1398     # appended to the name, do that.
1399     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1400         $viacode[$i] .= sprintf("-%04X", $i);
1401         $end = $perl_charname->containing_range($i)->end;
1402     }
1403
1404     # And here, has a name, but if it's a hangul syllable one, replace it with
1405     # the correct name from the Unicode algorithm
1406     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1407         use integer;
1408         my $SIndex = $i - $SBase;
1409         my $L = $LBase + $SIndex / $NCount;
1410         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1411         my $T = $TBase + $SIndex % $TCount;
1412         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1413         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1414         $end = $perl_charname->containing_range($i)->end;
1415     }
1416
1417     return if ! defined wantarray;
1418     return $i if ! defined $end;    # If not a range, return the input
1419
1420     # Save this whole range so can find the end point quickly
1421     $annotate_ranges->add_map($i, $end, $end);
1422
1423     return $end;
1424 }
1425
1426 # Commented code below should work on Perl 5.8.
1427 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1428 ## the native perl version of it (which is what would operate under miniperl)
1429 ## is extremely slow, as it does a string eval every call.
1430 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1431 #                            && defined eval "require Scalar::Util";
1432 #
1433 #sub objaddr($) {
1434 #    # Returns the address of the blessed input object.  Uses the XS version if
1435 #    # available.  It doesn't check for blessedness because that would do a
1436 #    # string eval every call, and the program is structured so that this is
1437 #    # never called for a non-blessed object.
1438 #
1439 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1440 #
1441 #    # Check at least that is a ref.
1442 #    my $pkg = ref($_[0]) or return undef;
1443 #
1444 #    # Change to a fake package to defeat any overloaded stringify
1445 #    bless $_[0], 'main::Fake';
1446 #
1447 #    # Numifying a ref gives its address.
1448 #    my $addr = pack 'J', $_[0];
1449 #
1450 #    # Return to original class
1451 #    bless $_[0], $pkg;
1452 #    return $addr;
1453 #}
1454
1455 sub max ($$) {
1456     my $a = shift;
1457     my $b = shift;
1458     return $a if $a >= $b;
1459     return $b;
1460 }
1461
1462 sub min ($$) {
1463     my $a = shift;
1464     my $b = shift;
1465     return $a if $a <= $b;
1466     return $b;
1467 }
1468
1469 sub clarify_number ($) {
1470     # This returns the input number with underscores inserted every 3 digits
1471     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1472     # checked.
1473
1474     my $number = shift;
1475     my $pos = length($number) - 3;
1476     return $number if $pos <= 1;
1477     while ($pos > 0) {
1478         substr($number, $pos, 0) = '_';
1479         $pos -= 3;
1480     }
1481     return $number;
1482 }
1483
1484
1485 package Carp;
1486
1487 # These routines give a uniform treatment of messages in this program.  They
1488 # are placed in the Carp package to cause the stack trace to not include them,
1489 # although an alternative would be to use another package and set @CARP_NOT
1490 # for it.
1491
1492 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1493
1494 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1495 # and overload trying to load Scalar:Util under miniperl.  See
1496 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1497 undef $overload::VERSION;
1498
1499 sub my_carp {
1500     my $message = shift || "";
1501     my $nofold = shift || 0;
1502
1503     if ($message) {
1504         $message = main::join_lines($message);
1505         $message =~ s/^$0: *//;     # Remove initial program name
1506         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1507         $message = "\n$0: $message;";
1508
1509         # Fold the message with program name, semi-colon end punctuation
1510         # (which looks good with the message that carp appends to it), and a
1511         # hanging indent for continuation lines.
1512         $message = main::simple_fold($message, "", 4) unless $nofold;
1513         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1514                                     # appends is to the same line
1515     }
1516
1517     return $message if defined wantarray;   # If a caller just wants the msg
1518
1519     carp $message;
1520     return;
1521 }
1522
1523 sub my_carp_bug {
1524     # This is called when it is clear that the problem is caused by a bug in
1525     # this program.
1526
1527     my $message = shift;
1528     $message =~ s/^$0: *//;
1529     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1530     carp $message;
1531     return;
1532 }
1533
1534 sub carp_too_few_args {
1535     if (@_ != 2) {
1536         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1537         return;
1538     }
1539
1540     my $args_ref = shift;
1541     my $count = shift;
1542
1543     my_carp_bug("Need at least $count arguments to "
1544         . (caller 1)[3]
1545         . ".  Instead got: '"
1546         . join ', ', @$args_ref
1547         . "'.  No action taken.");
1548     return;
1549 }
1550
1551 sub carp_extra_args {
1552     my $args_ref = shift;
1553     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1554
1555     unless (ref $args_ref) {
1556         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1557         return;
1558     }
1559     my ($package, $file, $line) = caller;
1560     my $subroutine = (caller 1)[3];
1561
1562     my $list;
1563     if (ref $args_ref eq 'HASH') {
1564         foreach my $key (keys %$args_ref) {
1565             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1566         }
1567         $list = join ', ', each %{$args_ref};
1568     }
1569     elsif (ref $args_ref eq 'ARRAY') {
1570         foreach my $arg (@$args_ref) {
1571             $arg = $UNDEF unless defined $arg;
1572         }
1573         $list = join ', ', @$args_ref;
1574     }
1575     else {
1576         my_carp_bug("Can't cope with ref "
1577                 . ref($args_ref)
1578                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1579         return;
1580     }
1581
1582     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1583     return;
1584 }
1585
1586 package main;
1587
1588 { # Closure
1589
1590     # This program uses the inside-out method for objects, as recommended in
1591     # "Perl Best Practices".  This closure aids in generating those.  There
1592     # are two routines.  setup_package() is called once per package to set
1593     # things up, and then set_access() is called for each hash representing a
1594     # field in the object.  These routines arrange for the object to be
1595     # properly destroyed when no longer used, and for standard accessor
1596     # functions to be generated.  If you need more complex accessors, just
1597     # write your own and leave those accesses out of the call to set_access().
1598     # More details below.
1599
1600     my %constructor_fields; # fields that are to be used in constructors; see
1601                             # below
1602
1603     # The values of this hash will be the package names as keys to other
1604     # hashes containing the name of each field in the package as keys, and
1605     # references to their respective hashes as values.
1606     my %package_fields;
1607
1608     sub setup_package {
1609         # Sets up the package, creating standard DESTROY and dump methods
1610         # (unless already defined).  The dump method is used in debugging by
1611         # simple_dumper().
1612         # The optional parameters are:
1613         #   a)  a reference to a hash, that gets populated by later
1614         #       set_access() calls with one of the accesses being
1615         #       'constructor'.  The caller can then refer to this, but it is
1616         #       not otherwise used by these two routines.
1617         #   b)  a reference to a callback routine to call during destruction
1618         #       of the object, before any fields are actually destroyed
1619
1620         my %args = @_;
1621         my $constructor_ref = delete $args{'Constructor_Fields'};
1622         my $destroy_callback = delete $args{'Destroy_Callback'};
1623         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1624
1625         my %fields;
1626         my $package = (caller)[0];
1627
1628         $package_fields{$package} = \%fields;
1629         $constructor_fields{$package} = $constructor_ref;
1630
1631         unless ($package->can('DESTROY')) {
1632             my $destroy_name = "${package}::DESTROY";
1633             no strict "refs";
1634
1635             # Use typeglob to give the anonymous subroutine the name we want
1636             *$destroy_name = sub {
1637                 my $self = shift;
1638                 my $addr = do { no overloading; pack 'J', $self; };
1639
1640                 $self->$destroy_callback if $destroy_callback;
1641                 foreach my $field (keys %{$package_fields{$package}}) {
1642                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1643                     delete $package_fields{$package}{$field}{$addr};
1644                 }
1645                 return;
1646             }
1647         }
1648
1649         unless ($package->can('dump')) {
1650             my $dump_name = "${package}::dump";
1651             no strict "refs";
1652             *$dump_name = sub {
1653                 my $self = shift;
1654                 return dump_inside_out($self, $package_fields{$package}, @_);
1655             }
1656         }
1657         return;
1658     }
1659
1660     sub set_access {
1661         # Arrange for the input field to be garbage collected when no longer
1662         # needed.  Also, creates standard accessor functions for the field
1663         # based on the optional parameters-- none if none of these parameters:
1664         #   'addable'    creates an 'add_NAME()' accessor function.
1665         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1666         #                function.
1667         #   'settable'   creates a 'set_NAME()' accessor function.
1668         #   'constructor' doesn't create an accessor function, but adds the
1669         #                field to the hash that was previously passed to
1670         #                setup_package();
1671         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1672         # 'add' etc. all mean 'addable'.
1673         # The read accessor function will work on both array and scalar
1674         # values.  If another accessor in the parameter list is 'a', the read
1675         # access assumes an array.  You can also force it to be array access
1676         # by specifying 'readable_array' instead of 'readable'
1677         #
1678         # A sort-of 'protected' access can be set-up by preceding the addable,
1679         # readable or settable with some initial portion of 'protected_' (but,
1680         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1681         # "protection" is only by convention.  All that happens is that the
1682         # accessor functions' names begin with an underscore.  So instead of
1683         # calling set_foo, the call is _set_foo.  (Real protection could be
1684         # accomplished by having a new subroutine, end_package, called at the
1685         # end of each package, and then storing the __LINE__ ranges and
1686         # checking them on every accessor.  But that is way overkill.)
1687
1688         # We create anonymous subroutines as the accessors and then use
1689         # typeglobs to assign them to the proper package and name
1690
1691         my $name = shift;   # Name of the field
1692         my $field = shift;  # Reference to the inside-out hash containing the
1693                             # field
1694
1695         my $package = (caller)[0];
1696
1697         if (! exists $package_fields{$package}) {
1698             croak "$0: Must call 'setup_package' before 'set_access'";
1699         }
1700
1701         # Stash the field so DESTROY can get it.
1702         $package_fields{$package}{$name} = $field;
1703
1704         # Remaining arguments are the accessors.  For each...
1705         foreach my $access (@_) {
1706             my $access = lc $access;
1707
1708             my $protected = "";
1709
1710             # Match the input as far as it goes.
1711             if ($access =~ /^(p[^_]*)_/) {
1712                 $protected = $1;
1713                 if (substr('protected_', 0, length $protected)
1714                     eq $protected)
1715                 {
1716
1717                     # Add 1 for the underscore not included in $protected
1718                     $access = substr($access, length($protected) + 1);
1719                     $protected = '_';
1720                 }
1721                 else {
1722                     $protected = "";
1723                 }
1724             }
1725
1726             if (substr('addable', 0, length $access) eq $access) {
1727                 my $subname = "${package}::${protected}add_$name";
1728                 no strict "refs";
1729
1730                 # add_ accessor.  Don't add if already there, which we
1731                 # determine using 'eq' for scalars and '==' otherwise.
1732                 *$subname = sub {
1733                     use strict "refs";
1734                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1735                     my $self = shift;
1736                     my $value = shift;
1737                     my $addr = do { no overloading; pack 'J', $self; };
1738                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1739                     if (ref $value) {
1740                         return if grep { $value == $_ } @{$field->{$addr}};
1741                     }
1742                     else {
1743                         return if grep { $value eq $_ } @{$field->{$addr}};
1744                     }
1745                     push @{$field->{$addr}}, $value;
1746                     return;
1747                 }
1748             }
1749             elsif (substr('constructor', 0, length $access) eq $access) {
1750                 if ($protected) {
1751                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1752                 }
1753                 else {
1754                     $constructor_fields{$package}{$name} = $field;
1755                 }
1756             }
1757             elsif (substr('readable_array', 0, length $access) eq $access) {
1758
1759                 # Here has read access.  If one of the other parameters for
1760                 # access is array, or this one specifies array (by being more
1761                 # than just 'readable_'), then create a subroutine that
1762                 # assumes the data is an array.  Otherwise just a scalar
1763                 my $subname = "${package}::${protected}$name";
1764                 if (grep { /^a/i } @_
1765                     or length($access) > length('readable_'))
1766                 {
1767                     no strict "refs";
1768                     *$subname = sub {
1769                         use strict "refs";
1770                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1771                         my $addr = do { no overloading; pack 'J', $_[0]; };
1772                         if (ref $field->{$addr} ne 'ARRAY') {
1773                             my $type = ref $field->{$addr};
1774                             $type = 'scalar' unless $type;
1775                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1776                             return;
1777                         }
1778                         return scalar @{$field->{$addr}} unless wantarray;
1779
1780                         # Make a copy; had problems with caller modifying the
1781                         # original otherwise
1782                         my @return = @{$field->{$addr}};
1783                         return @return;
1784                     }
1785                 }
1786                 else {
1787
1788                     # Here not an array value, a simpler function.
1789                     no strict "refs";
1790                     *$subname = sub {
1791                         use strict "refs";
1792                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1793                         no overloading;
1794                         return $field->{pack 'J', $_[0]};
1795                     }
1796                 }
1797             }
1798             elsif (substr('settable', 0, length $access) eq $access) {
1799                 my $subname = "${package}::${protected}set_$name";
1800                 no strict "refs";
1801                 *$subname = sub {
1802                     use strict "refs";
1803                     if (main::DEBUG) {
1804                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1805                         Carp::carp_extra_args(\@_) if @_ > 2;
1806                     }
1807                     # $self is $_[0]; $value is $_[1]
1808                     no overloading;
1809                     $field->{pack 'J', $_[0]} = $_[1];
1810                     return;
1811                 }
1812             }
1813             else {
1814                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1815             }
1816         }
1817         return;
1818     }
1819 }
1820
1821 package Input_file;
1822
1823 # All input files use this object, which stores various attributes about them,
1824 # and provides for convenient, uniform handling.  The run method wraps the
1825 # processing.  It handles all the bookkeeping of opening, reading, and closing
1826 # the file, returning only significant input lines.
1827 #
1828 # Each object gets a handler which processes the body of the file, and is
1829 # called by run().  Most should use the generic, default handler, which has
1830 # code scrubbed to handle things you might not expect.  A handler should
1831 # basically be a while(next_line()) {...} loop.
1832 #
1833 # You can also set up handlers to
1834 #   1) call before the first line is read for pre processing
1835 #   2) call to adjust each line of the input before the main handler gets them
1836 #   3) call upon EOF before the main handler exits its loop
1837 #   4) call at the end for post processing
1838 #
1839 # $_ is used to store the input line, and is to be filtered by the
1840 # each_line_handler()s.  So, if the format of the line is not in the desired
1841 # format for the main handler, these are used to do that adjusting.  They can
1842 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1843 # so the $_ output of one is used as the input to the next.  None of the other
1844 # handlers are stackable, but could easily be changed to be so.
1845 #
1846 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1847 # which insert the parameters as lines to be processed before the next input
1848 # file line is read.  This allows the EOF handler to flush buffers, for
1849 # example.  The difference between the two routines is that the lines inserted
1850 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1851 # called it from such a handler, you would get infinite recursion.)  Lines
1852 # inserted by insert_adjusted_lines() go directly to the main handler without
1853 # any adjustments.  If the  post-processing handler calls any of these, there
1854 # will be no effect.  Some error checking for these conditions could be added,
1855 # but it hasn't been done.
1856 #
1857 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1858 # to prevent further processing of the line.  This routine will output the
1859 # message as a warning once, and then keep a count of the lines that have the
1860 # same message, and output that count at the end of the file's processing.
1861 # This keeps the number of messages down to a manageable amount.
1862 #
1863 # get_missings() should be called to retrieve any @missing input lines.
1864 # Messages will be raised if this isn't done if the options aren't to ignore
1865 # missings.
1866
1867 sub trace { return main::trace(@_); }
1868
1869 { # Closure
1870     # Keep track of fields that are to be put into the constructor.
1871     my %constructor_fields;
1872
1873     main::setup_package(Constructor_Fields => \%constructor_fields);
1874
1875     my %file; # Input file name, required
1876     main::set_access('file', \%file, qw{ c r });
1877
1878     my %first_released; # Unicode version file was first released in, required
1879     main::set_access('first_released', \%first_released, qw{ c r });
1880
1881     my %handler;    # Subroutine to process the input file, defaults to
1882                     # 'process_generic_property_file'
1883     main::set_access('handler', \%handler, qw{ c });
1884
1885     my %property;
1886     # name of property this file is for.  defaults to none, meaning not
1887     # applicable, or is otherwise determinable, for example, from each line.
1888     main::set_access('property', \%property, qw{ c });
1889
1890     my %optional;
1891     # If this is true, the file is optional.  If not present, no warning is
1892     # output.  If it is present, the string given by this parameter is
1893     # evaluated, and if false the file is not processed.
1894     main::set_access('optional', \%optional, 'c', 'r');
1895
1896     my %non_skip;
1897     # This is used for debugging, to skip processing of all but a few input
1898     # files.  Add 'non_skip => 1' to the constructor for those files you want
1899     # processed when you set the $debug_skip global.
1900     main::set_access('non_skip', \%non_skip, 'c');
1901
1902     my %skip;
1903     # This is used to skip processing of this input file semi-permanently.
1904     # It is used for files that we aren't planning to process anytime soon,
1905     # but want to allow to be in the directory and not raise a message that we
1906     # are not handling.  Mostly for test files.  This is in contrast to the
1907     # non_skip element, which is supposed to be used very temporarily for
1908     # debugging.  Sets 'optional' to 1
1909     main::set_access('skip', \%skip, 'c');
1910
1911     my %each_line_handler;
1912     # list of subroutines to look at and filter each non-comment line in the
1913     # file.  defaults to none.  The subroutines are called in order, each is
1914     # to adjust $_ for the next one, and the final one adjusts it for
1915     # 'handler'
1916     main::set_access('each_line_handler', \%each_line_handler, 'c');
1917
1918     my %has_missings_defaults;
1919     # ? Are there lines in the file giving default values for code points
1920     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
1921     # the norm, but IGNORED means it has such lines, but the handler doesn't
1922     # use them.  Having these three states allows us to catch changes to the
1923     # UCD that this program should track
1924     main::set_access('has_missings_defaults',
1925                                         \%has_missings_defaults, qw{ c r });
1926
1927     my %pre_handler;
1928     # Subroutine to call before doing anything else in the file.  If undef, no
1929     # such handler is called.
1930     main::set_access('pre_handler', \%pre_handler, qw{ c });
1931
1932     my %eof_handler;
1933     # Subroutine to call upon getting an EOF on the input file, but before
1934     # that is returned to the main handler.  This is to allow buffers to be
1935     # flushed.  The handler is expected to call insert_lines() or
1936     # insert_adjusted() with the buffered material
1937     main::set_access('eof_handler', \%eof_handler, qw{ c r });
1938
1939     my %post_handler;
1940     # Subroutine to call after all the lines of the file are read in and
1941     # processed.  If undef, no such handler is called.
1942     main::set_access('post_handler', \%post_handler, qw{ c });
1943
1944     my %progress_message;
1945     # Message to print to display progress in lieu of the standard one
1946     main::set_access('progress_message', \%progress_message, qw{ c });
1947
1948     my %handle;
1949     # cache open file handle, internal.  Is undef if file hasn't been
1950     # processed at all, empty if has;
1951     main::set_access('handle', \%handle);
1952
1953     my %added_lines;
1954     # cache of lines added virtually to the file, internal
1955     main::set_access('added_lines', \%added_lines);
1956
1957     my %errors;
1958     # cache of errors found, internal
1959     main::set_access('errors', \%errors);
1960
1961     my %missings;
1962     # storage of '@missing' defaults lines
1963     main::set_access('missings', \%missings);
1964
1965     sub new {
1966         my $class = shift;
1967
1968         my $self = bless \do{ my $anonymous_scalar }, $class;
1969         my $addr = do { no overloading; pack 'J', $self; };
1970
1971         # Set defaults
1972         $handler{$addr} = \&main::process_generic_property_file;
1973         $non_skip{$addr} = 0;
1974         $skip{$addr} = 0;
1975         $has_missings_defaults{$addr} = $NO_DEFAULTS;
1976         $handle{$addr} = undef;
1977         $added_lines{$addr} = [ ];
1978         $each_line_handler{$addr} = [ ];
1979         $errors{$addr} = { };
1980         $missings{$addr} = [ ];
1981
1982         # Two positional parameters.
1983         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1984         $file{$addr} = main::internal_file_to_platform(shift);
1985         $first_released{$addr} = shift;
1986
1987         # The rest of the arguments are key => value pairs
1988         # %constructor_fields has been set up earlier to list all possible
1989         # ones.  Either set or push, depending on how the default has been set
1990         # up just above.
1991         my %args = @_;
1992         foreach my $key (keys %args) {
1993             my $argument = $args{$key};
1994
1995             # Note that the fields are the lower case of the constructor keys
1996             my $hash = $constructor_fields{lc $key};
1997             if (! defined $hash) {
1998                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
1999                 next;
2000             }
2001             if (ref $hash->{$addr} eq 'ARRAY') {
2002                 if (ref $argument eq 'ARRAY') {
2003                     foreach my $argument (@{$argument}) {
2004                         next if ! defined $argument;
2005                         push @{$hash->{$addr}}, $argument;
2006                     }
2007                 }
2008                 else {
2009                     push @{$hash->{$addr}}, $argument if defined $argument;
2010                 }
2011             }
2012             else {
2013                 $hash->{$addr} = $argument;
2014             }
2015             delete $args{$key};
2016         };
2017
2018         # If the file has a property for it, it means that the property is not
2019         # listed in the file's entries.  So add a handler to the list of line
2020         # handlers to insert the property name into the lines, to provide a
2021         # uniform interface to the final processing subroutine.
2022         # the final code doesn't have to worry about that.
2023         if ($property{$addr}) {
2024             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2025         }
2026
2027         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2028             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2029         }
2030
2031         $optional{$addr} = 1 if $skip{$addr};
2032
2033         return $self;
2034     }
2035
2036
2037     use overload
2038         fallback => 0,
2039         qw("") => "_operator_stringify",
2040         "." => \&main::_operator_dot,
2041     ;
2042
2043     sub _operator_stringify {
2044         my $self = shift;
2045
2046         return __PACKAGE__ . " object for " . $self->file;
2047     }
2048
2049     # flag to make sure extracted files are processed early
2050     my $seen_non_extracted_non_age = 0;
2051
2052     sub run {
2053         # Process the input object $self.  This opens and closes the file and
2054         # calls all the handlers for it.  Currently,  this can only be called
2055         # once per file, as it destroy's the EOF handler
2056
2057         my $self = shift;
2058         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2059
2060         my $addr = do { no overloading; pack 'J', $self; };
2061
2062         my $file = $file{$addr};
2063
2064         # Don't process if not expecting this file (because released later
2065         # than this Unicode version), and isn't there.  This means if someone
2066         # copies it into an earlier version's directory, we will go ahead and
2067         # process it.
2068         return if $first_released{$addr} gt $v_version && ! -e $file;
2069
2070         # If in debugging mode and this file doesn't have the non-skip
2071         # flag set, and isn't one of the critical files, skip it.
2072         if ($debug_skip
2073             && $first_released{$addr} ne v0
2074             && ! $non_skip{$addr})
2075         {
2076             print "Skipping $file in debugging\n" if $verbosity;
2077             return;
2078         }
2079
2080         # File could be optional
2081         if ($optional{$addr}) {
2082             return unless -e $file;
2083             my $result = eval $optional{$addr};
2084             if (! defined $result) {
2085                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2086                 return;
2087             }
2088             if (! $result) {
2089                 if ($verbosity) {
2090                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2091                 }
2092                 return;
2093             }
2094         }
2095
2096         if (! defined $file || ! -e $file) {
2097
2098             # If the file doesn't exist, see if have internal data for it
2099             # (based on first_released being 0).
2100             if ($first_released{$addr} eq v0) {
2101                 $handle{$addr} = 'pretend_is_open';
2102             }
2103             else {
2104                 if (! $optional{$addr}  # File could be optional
2105                     && $v_version ge $first_released{$addr})
2106                 {
2107                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2108                 }
2109                 return;
2110             }
2111         }
2112         else {
2113
2114             # Here, the file exists.  Some platforms may change the case of
2115             # its name
2116             if ($seen_non_extracted_non_age) {
2117                 if ($file =~ /$EXTRACTED/i) {
2118                     Carp::my_carp_bug(join_lines(<<END
2119 $file should be processed just after the 'Prop...Alias' files, and before
2120 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2121 have subtle problems
2122 END
2123                     ));
2124                 }
2125             }
2126             elsif ($EXTRACTED_DIR
2127                     && $first_released{$addr} ne v0
2128                     && $file !~ /$EXTRACTED/i
2129                     && lc($file) ne 'dage.txt')
2130             {
2131                 # We don't set this (by the 'if' above) if we have no
2132                 # extracted directory, so if running on an early version,
2133                 # this test won't work.  Not worth worrying about.
2134                 $seen_non_extracted_non_age = 1;
2135             }
2136
2137             # And mark the file as having being processed, and warn if it
2138             # isn't a file we are expecting.  As we process the files,
2139             # they are deleted from the hash, so any that remain at the
2140             # end of the program are files that we didn't process.
2141             my $fkey = File::Spec->rel2abs($file);
2142             my $expecting = delete $potential_files{$fkey};
2143             $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
2144             Carp::my_carp("Was not expecting '$file'.") if
2145                     ! $expecting
2146                     && ! defined $handle{$addr};
2147
2148             # Having deleted from expected files, we can quit if not to do
2149             # anything.  Don't print progress unless really want verbosity
2150             if ($skip{$addr}) {
2151                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2152                 return;
2153             }
2154
2155             # Open the file, converting the slashes used in this program
2156             # into the proper form for the OS
2157             my $file_handle;
2158             if (not open $file_handle, "<", $file) {
2159                 Carp::my_carp("Can't open $file.  Skipping: $!");
2160                 return 0;
2161             }
2162             $handle{$addr} = $file_handle; # Cache the open file handle
2163         }
2164
2165         if ($verbosity >= $PROGRESS) {
2166             if ($progress_message{$addr}) {
2167                 print "$progress_message{$addr}\n";
2168             }
2169             else {
2170                 # If using a virtual file, say so.
2171                 print "Processing ", (-e $file)
2172                                        ? $file
2173                                        : "substitute $file",
2174                                      "\n";
2175             }
2176         }
2177
2178
2179         # Call any special handler for before the file.
2180         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2181
2182         # Then the main handler
2183         &{$handler{$addr}}($self);
2184
2185         # Then any special post-file handler.
2186         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2187
2188         # If any errors have been accumulated, output the counts (as the first
2189         # error message in each class was output when it was encountered).
2190         if ($errors{$addr}) {
2191             my $total = 0;
2192             my $types = 0;
2193             foreach my $error (keys %{$errors{$addr}}) {
2194                 $total += $errors{$addr}->{$error};
2195                 delete $errors{$addr}->{$error};
2196                 $types++;
2197             }
2198             if ($total > 1) {
2199                 my $message
2200                         = "A total of $total lines had errors in $file.  ";
2201
2202                 $message .= ($types == 1)
2203                             ? '(Only the first one was displayed.)'
2204                             : '(Only the first of each type was displayed.)';
2205                 Carp::my_carp($message);
2206             }
2207         }
2208
2209         if (@{$missings{$addr}}) {
2210             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2211         }
2212
2213         # If a real file handle, close it.
2214         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2215                                                         ref $handle{$addr};
2216         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2217                                # the file, as opposed to undef
2218         return;
2219     }
2220
2221     sub next_line {
2222         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2223         # if such a line exists.  'logical' means that any lines that have
2224         # been added via insert_lines() will be returned in $_ before the file
2225         # is read again.
2226
2227         my $self = shift;
2228         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2229
2230         my $addr = do { no overloading; pack 'J', $self; };
2231
2232         # Here the file is open (or if the handle is not a ref, is an open
2233         # 'virtual' file).  Get the next line; any inserted lines get priority
2234         # over the file itself.
2235         my $adjusted;
2236
2237         LINE:
2238         while (1) { # Loop until find non-comment, non-empty line
2239             #local $to_trace = 1 if main::DEBUG;
2240             my $inserted_ref = shift @{$added_lines{$addr}};
2241             if (defined $inserted_ref) {
2242                 ($adjusted, $_) = @{$inserted_ref};
2243                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2244                 return 1 if $adjusted;
2245             }
2246             else {
2247                 last if ! ref $handle{$addr}; # Don't read unless is real file
2248                 last if ! defined ($_ = readline $handle{$addr});
2249             }
2250             chomp;
2251             trace $_ if main::DEBUG && $to_trace;
2252
2253             # See if this line is the comment line that defines what property
2254             # value that code points that are not listed in the file should
2255             # have.  The format or existence of these lines is not guaranteed
2256             # by Unicode since they are comments, but the documentation says
2257             # that this was added for machine-readability, so probably won't
2258             # change.  This works starting in Unicode Version 5.0.  They look
2259             # like:
2260             #
2261             # @missing: 0000..10FFFF; Not_Reordered
2262             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2263             # @missing: 0000..10FFFF; ; NaN
2264             #
2265             # Save the line for a later get_missings() call.
2266             if (/$missing_defaults_prefix/) {
2267                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2268                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2269                 }
2270                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2271                     my @defaults = split /\s* ; \s*/x, $_;
2272
2273                     # The first field is the @missing, which ends in a
2274                     # semi-colon, so can safely shift.
2275                     shift @defaults;
2276
2277                     # Some of these lines may have empty field placeholders
2278                     # which get in the way.  An example is:
2279                     # @missing: 0000..10FFFF; ; NaN
2280                     # Remove them.  Process starting from the top so the
2281                     # splice doesn't affect things still to be looked at.
2282                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2283                         next if $defaults[$i] ne "";
2284                         splice @defaults, $i, 1;
2285                     }
2286
2287                     # What's left should be just the property (maybe) and the
2288                     # default.  Having only one element means it doesn't have
2289                     # the property.
2290                     my $default;
2291                     my $property;
2292                     if (@defaults >= 1) {
2293                         if (@defaults == 1) {
2294                             $default = $defaults[0];
2295                         }
2296                         else {
2297                             $property = $defaults[0];
2298                             $default = $defaults[1];
2299                         }
2300                     }
2301
2302                     if (@defaults < 1
2303                         || @defaults > 2
2304                         || ($default =~ /^</
2305                             && $default !~ /^<code *point>$/i
2306                             && $default !~ /^<none>$/i))
2307                     {
2308                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2309                     }
2310                     else {
2311
2312                         # If the property is missing from the line, it should
2313                         # be the one for the whole file
2314                         $property = $property{$addr} if ! defined $property;
2315
2316                         # Change <none> to the null string, which is what it
2317                         # really means.  If the default is the code point
2318                         # itself, set it to <code point>, which is what
2319                         # Unicode uses (but sometimes they've forgotten the
2320                         # space)
2321                         if ($default =~ /^<none>$/i) {
2322                             $default = "";
2323                         }
2324                         elsif ($default =~ /^<code *point>$/i) {
2325                             $default = $CODE_POINT;
2326                         }
2327
2328                         # Store them as a sub-arrays with both components.
2329                         push @{$missings{$addr}}, [ $default, $property ];
2330                     }
2331                 }
2332
2333                 # There is nothing for the caller to process on this comment
2334                 # line.
2335                 next;
2336             }
2337
2338             # Remove comments and trailing space, and skip this line if the
2339             # result is empty
2340             s/#.*//;
2341             s/\s+$//;
2342             next if /^$/;
2343
2344             # Call any handlers for this line, and skip further processing of
2345             # the line if the handler sets the line to null.
2346             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2347                 &{$sub_ref}($self);
2348                 next LINE if /^$/;
2349             }
2350
2351             # Here the line is ok.  return success.
2352             return 1;
2353         } # End of looping through lines.
2354
2355         # If there is an EOF handler, call it (only once) and if it generates
2356         # more lines to process go back in the loop to handle them.
2357         if ($eof_handler{$addr}) {
2358             &{$eof_handler{$addr}}($self);
2359             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2360             goto LINE if $added_lines{$addr};
2361         }
2362
2363         # Return failure -- no more lines.
2364         return 0;
2365
2366     }
2367
2368 #   Not currently used, not fully tested.
2369 #    sub peek {
2370 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2371 #        # record.  Not callable from an each_line_handler(), nor does it call
2372 #        # an each_line_handler() on the line.
2373 #
2374 #        my $self = shift;
2375 #        my $addr = do { no overloading; pack 'J', $self; };
2376 #
2377 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2378 #            my ($adjusted, $line) = @{$inserted_ref};
2379 #            next if $adjusted;
2380 #
2381 #            # Remove comments and trailing space, and return a non-empty
2382 #            # resulting line
2383 #            $line =~ s/#.*//;
2384 #            $line =~ s/\s+$//;
2385 #            return $line if $line ne "";
2386 #        }
2387 #
2388 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2389 #        while (1) { # Loop until find non-comment, non-empty line
2390 #            local $to_trace = 1 if main::DEBUG;
2391 #            trace $_ if main::DEBUG && $to_trace;
2392 #            return if ! defined (my $line = readline $handle{$addr});
2393 #            chomp $line;
2394 #            push @{$added_lines{$addr}}, [ 0, $line ];
2395 #
2396 #            $line =~ s/#.*//;
2397 #            $line =~ s/\s+$//;
2398 #            return $line if $line ne "";
2399 #        }
2400 #
2401 #        return;
2402 #    }
2403
2404
2405     sub insert_lines {
2406         # Lines can be inserted so that it looks like they were in the input
2407         # file at the place it was when this routine is called.  See also
2408         # insert_adjusted_lines().  Lines inserted via this routine go through
2409         # any each_line_handler()
2410
2411         my $self = shift;
2412
2413         # Each inserted line is an array, with the first element being 0 to
2414         # indicate that this line hasn't been adjusted, and needs to be
2415         # processed.
2416         no overloading;
2417         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2418         return;
2419     }
2420
2421     sub insert_adjusted_lines {
2422         # Lines can be inserted so that it looks like they were in the input
2423         # file at the place it was when this routine is called.  See also
2424         # insert_lines().  Lines inserted via this routine are already fully
2425         # adjusted, ready to be processed; each_line_handler()s handlers will
2426         # not be called.  This means this is not a completely general
2427         # facility, as only the last each_line_handler on the stack should
2428         # call this.  It could be made more general, by passing to each of the
2429         # line_handlers their position on the stack, which they would pass on
2430         # to this routine, and that would replace the boolean first element in
2431         # the anonymous array pushed here, so that the next_line routine could
2432         # use that to call only those handlers whose index is after it on the
2433         # stack.  But this is overkill for what is needed now.
2434
2435         my $self = shift;
2436         trace $_[0] if main::DEBUG && $to_trace;
2437
2438         # Each inserted line is an array, with the first element being 1 to
2439         # indicate that this line has been adjusted
2440         no overloading;
2441         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2442         return;
2443     }
2444
2445     sub get_missings {
2446         # Returns the stored up @missings lines' values, and clears the list.
2447         # The values are in an array, consisting of the default in the first
2448         # element, and the property in the 2nd.  However, since these lines
2449         # can be stacked up, the return is an array of all these arrays.
2450
2451         my $self = shift;
2452         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2453
2454         my $addr = do { no overloading; pack 'J', $self; };
2455
2456         # If not accepting a list return, just return the first one.
2457         return shift @{$missings{$addr}} unless wantarray;
2458
2459         my @return = @{$missings{$addr}};
2460         undef @{$missings{$addr}};
2461         return @return;
2462     }
2463
2464     sub _insert_property_into_line {
2465         # Add a property field to $_, if this file requires it.
2466
2467         my $self = shift;
2468         my $addr = do { no overloading; pack 'J', $self; };
2469         my $property = $property{$addr};
2470         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2471
2472         $_ =~ s/(;|$)/; $property$1/;
2473         return;
2474     }
2475
2476     sub carp_bad_line {
2477         # Output consistent error messages, using either a generic one, or the
2478         # one given by the optional parameter.  To avoid gazillions of the
2479         # same message in case the syntax of a  file is way off, this routine
2480         # only outputs the first instance of each message, incrementing a
2481         # count so the totals can be output at the end of the file.
2482
2483         my $self = shift;
2484         my $message = shift;
2485         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2486
2487         my $addr = do { no overloading; pack 'J', $self; };
2488
2489         $message = 'Unexpected line' unless $message;
2490
2491         # No trailing punctuation so as to fit with our addenda.
2492         $message =~ s/[.:;,]$//;
2493
2494         # If haven't seen this exact message before, output it now.  Otherwise
2495         # increment the count of how many times it has occurred
2496         unless ($errors{$addr}->{$message}) {
2497             Carp::my_carp("$message in '$_' in "
2498                             . $file{$addr}
2499                             . " at line $..  Skipping this line;");
2500             $errors{$addr}->{$message} = 1;
2501         }
2502         else {
2503             $errors{$addr}->{$message}++;
2504         }
2505
2506         # Clear the line to prevent any further (meaningful) processing of it.
2507         $_ = "";
2508
2509         return;
2510     }
2511 } # End closure
2512
2513 package Multi_Default;
2514
2515 # Certain properties in early versions of Unicode had more than one possible
2516 # default for code points missing from the files.  In these cases, one
2517 # default applies to everything left over after all the others are applied,
2518 # and for each of the others, there is a description of which class of code
2519 # points applies to it.  This object helps implement this by storing the
2520 # defaults, and for all but that final default, an eval string that generates
2521 # the class that it applies to.
2522
2523
2524 {   # Closure
2525
2526     main::setup_package();
2527
2528     my %class_defaults;
2529     # The defaults structure for the classes
2530     main::set_access('class_defaults', \%class_defaults);
2531
2532     my %other_default;
2533     # The default that applies to everything left over.
2534     main::set_access('other_default', \%other_default, 'r');
2535
2536
2537     sub new {
2538         # The constructor is called with default => eval pairs, terminated by
2539         # the left-over default. e.g.
2540         # Multi_Default->new(
2541         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2542         #               -  0x200D',
2543         #        'R' => 'some other expression that evaluates to code points',
2544         #        .
2545         #        .
2546         #        .
2547         #        'U'));
2548
2549         my $class = shift;
2550
2551         my $self = bless \do{my $anonymous_scalar}, $class;
2552         my $addr = do { no overloading; pack 'J', $self; };
2553
2554         while (@_ > 1) {
2555             my $default = shift;
2556             my $eval = shift;
2557             $class_defaults{$addr}->{$default} = $eval;
2558         }
2559
2560         $other_default{$addr} = shift;
2561
2562         return $self;
2563     }
2564
2565     sub get_next_defaults {
2566         # Iterates and returns the next class of defaults.
2567         my $self = shift;
2568         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2569
2570         my $addr = do { no overloading; pack 'J', $self; };
2571
2572         return each %{$class_defaults{$addr}};
2573     }
2574 }
2575
2576 package Alias;
2577
2578 # An alias is one of the names that a table goes by.  This class defines them
2579 # including some attributes.  Everything is currently setup in the
2580 # constructor.
2581
2582
2583 {   # Closure
2584
2585     main::setup_package();
2586
2587     my %name;
2588     main::set_access('name', \%name, 'r');
2589
2590     my %loose_match;
2591     # Determined by the constructor code if this name should match loosely or
2592     # not.  The constructor parameters can override this, but it isn't fully
2593     # implemented, as should have ability to override Unicode one's via
2594     # something like a set_loose_match()
2595     main::set_access('loose_match', \%loose_match, 'r');
2596
2597     my %make_pod_entry;
2598     # Some aliases should not get their own entries because they are covered
2599     # by a wild-card, and some we want to discourage use of.  Binary
2600     main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2601
2602     my %status;
2603     # Aliases have a status, like deprecated, or even suppressed (which means
2604     # they don't appear in documentation).  Enum
2605     main::set_access('status', \%status, 'r');
2606
2607     my %externally_ok;
2608     # Similarly, some aliases should not be considered as usable ones for
2609     # external use, such as file names, or we don't want documentation to
2610     # recommend them.  Boolean
2611     main::set_access('externally_ok', \%externally_ok, 'r');
2612
2613     sub new {
2614         my $class = shift;
2615
2616         my $self = bless \do { my $anonymous_scalar }, $class;
2617         my $addr = do { no overloading; pack 'J', $self; };
2618
2619         $name{$addr} = shift;
2620         $loose_match{$addr} = shift;
2621         $make_pod_entry{$addr} = shift;
2622         $externally_ok{$addr} = shift;
2623         $status{$addr} = shift;
2624
2625         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2626
2627         # Null names are never ok externally
2628         $externally_ok{$addr} = 0 if $name{$addr} eq "";
2629
2630         return $self;
2631     }
2632 }
2633
2634 package Range;
2635
2636 # A range is the basic unit for storing code points, and is described in the
2637 # comments at the beginning of the program.  Each range has a starting code
2638 # point; an ending code point (not less than the starting one); a value
2639 # that applies to every code point in between the two end-points, inclusive;
2640 # and an enum type that applies to the value.  The type is for the user's
2641 # convenience, and has no meaning here, except that a non-zero type is
2642 # considered to not obey the normal Unicode rules for having standard forms.
2643 #
2644 # The same structure is used for both map and match tables, even though in the
2645 # latter, the value (and hence type) is irrelevant and could be used as a
2646 # comment.  In map tables, the value is what all the code points in the range
2647 # map to.  Type 0 values have the standardized version of the value stored as
2648 # well, so as to not have to recalculate it a lot.
2649
2650 sub trace { return main::trace(@_); }
2651
2652 {   # Closure
2653
2654     main::setup_package();
2655
2656     my %start;
2657     main::set_access('start', \%start, 'r', 's');
2658
2659     my %end;
2660     main::set_access('end', \%end, 'r', 's');
2661
2662     my %value;
2663     main::set_access('value', \%value, 'r');
2664
2665     my %type;
2666     main::set_access('type', \%type, 'r');
2667
2668     my %standard_form;
2669     # The value in internal standard form.  Defined only if the type is 0.
2670     main::set_access('standard_form', \%standard_form);
2671
2672     # Note that if these fields change, the dump() method should as well
2673
2674     sub new {
2675         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2676         my $class = shift;
2677
2678         my $self = bless \do { my $anonymous_scalar }, $class;
2679         my $addr = do { no overloading; pack 'J', $self; };
2680
2681         $start{$addr} = shift;
2682         $end{$addr} = shift;
2683
2684         my %args = @_;
2685
2686         my $value = delete $args{'Value'};  # Can be 0
2687         $value = "" unless defined $value;
2688         $value{$addr} = $value;
2689
2690         $type{$addr} = delete $args{'Type'} || 0;
2691
2692         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2693
2694         if (! $type{$addr}) {
2695             $standard_form{$addr} = main::standardize($value);
2696         }
2697
2698         return $self;
2699     }
2700
2701     use overload
2702         fallback => 0,
2703         qw("") => "_operator_stringify",
2704         "." => \&main::_operator_dot,
2705     ;
2706
2707     sub _operator_stringify {
2708         my $self = shift;
2709         my $addr = do { no overloading; pack 'J', $self; };
2710
2711         # Output it like '0041..0065 (value)'
2712         my $return = sprintf("%04X", $start{$addr})
2713                         .  '..'
2714                         . sprintf("%04X", $end{$addr});
2715         my $value = $value{$addr};
2716         my $type = $type{$addr};
2717         $return .= ' (';
2718         $return .= "$value";
2719         $return .= ", Type=$type" if $type != 0;
2720         $return .= ')';
2721
2722         return $return;
2723     }
2724
2725     sub standard_form {
2726         # The standard form is the value itself if the standard form is
2727         # undefined (that is if the value is special)
2728
2729         my $self = shift;
2730         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2731
2732         my $addr = do { no overloading; pack 'J', $self; };
2733
2734         return $standard_form{$addr} if defined $standard_form{$addr};
2735         return $value{$addr};
2736     }
2737
2738     sub dump {
2739         # Human, not machine readable.  For machine readable, comment out this
2740         # entire routine and let the standard one take effect.
2741         my $self = shift;
2742         my $indent = shift;
2743         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2744
2745         my $addr = do { no overloading; pack 'J', $self; };
2746
2747         my $return = $indent
2748                     . sprintf("%04X", $start{$addr})
2749                     . '..'
2750                     . sprintf("%04X", $end{$addr})
2751                     . " '$value{$addr}';";
2752         if (! defined $standard_form{$addr}) {
2753             $return .= "(type=$type{$addr})";
2754         }
2755         elsif ($standard_form{$addr} ne $value{$addr}) {
2756             $return .= "(standard '$standard_form{$addr}')";
2757         }
2758         return $return;
2759     }
2760 } # End closure
2761
2762 package _Range_List_Base;
2763
2764 # Base class for range lists.  A range list is simply an ordered list of
2765 # ranges, so that the ranges with the lowest starting numbers are first in it.
2766 #
2767 # When a new range is added that is adjacent to an existing range that has the
2768 # same value and type, it merges with it to form a larger range.
2769 #
2770 # Ranges generally do not overlap, except that there can be multiple entries
2771 # of single code point ranges.  This is because of NameAliases.txt.
2772 #
2773 # In this program, there is a standard value such that if two different
2774 # values, have the same standard value, they are considered equivalent.  This
2775 # value was chosen so that it gives correct results on Unicode data
2776
2777 # There are a number of methods to manipulate range lists, and some operators
2778 # are overloaded to handle them.
2779
2780 sub trace { return main::trace(@_); }
2781
2782 { # Closure
2783
2784     our $addr;
2785
2786     main::setup_package();
2787
2788     my %ranges;
2789     # The list of ranges
2790     main::set_access('ranges', \%ranges, 'readable_array');
2791
2792     my %max;
2793     # The highest code point in the list.  This was originally a method, but
2794     # actual measurements said it was used a lot.
2795     main::set_access('max', \%max, 'r');
2796
2797     my %each_range_iterator;
2798     # Iterator position for each_range()
2799     main::set_access('each_range_iterator', \%each_range_iterator);
2800
2801     my %owner_name_of;
2802     # Name of parent this is attached to, if any.  Solely for better error
2803     # messages.
2804     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2805
2806     my %_search_ranges_cache;
2807     # A cache of the previous result from _search_ranges(), for better
2808     # performance
2809     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2810
2811     sub new {
2812         my $class = shift;
2813         my %args = @_;
2814
2815         # Optional initialization data for the range list.
2816         my $initialize = delete $args{'Initialize'};
2817
2818         my $self;
2819
2820         # Use _union() to initialize.  _union() returns an object of this
2821         # class, which means that it will call this constructor recursively.
2822         # But it won't have this $initialize parameter so that it won't
2823         # infinitely loop on this.
2824         return _union($class, $initialize, %args) if defined $initialize;
2825
2826         $self = bless \do { my $anonymous_scalar }, $class;
2827         my $addr = do { no overloading; pack 'J', $self; };
2828
2829         # Optional parent object, only for debug info.
2830         $owner_name_of{$addr} = delete $args{'Owner'};
2831         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2832
2833         # Stringify, in case it is an object.
2834         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2835
2836         # This is used only for error messages, and so a colon is added
2837         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2838
2839         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2840
2841         # Max is initialized to a negative value that isn't adjacent to 0,
2842         # for simpler tests
2843         $max{$addr} = -2;
2844
2845         $_search_ranges_cache{$addr} = 0;
2846         $ranges{$addr} = [];
2847
2848         return $self;
2849     }
2850
2851     use overload
2852         fallback => 0,
2853         qw("") => "_operator_stringify",
2854         "." => \&main::_operator_dot,
2855     ;
2856
2857     sub _operator_stringify {
2858         my $self = shift;
2859         my $addr = do { no overloading; pack 'J', $self; };
2860
2861         return "Range_List attached to '$owner_name_of{$addr}'"
2862                                                 if $owner_name_of{$addr};
2863         return "anonymous Range_List " . \$self;
2864     }
2865
2866     sub _union {
2867         # Returns the union of the input code points.  It can be called as
2868         # either a constructor or a method.  If called as a method, the result
2869         # will be a new() instance of the calling object, containing the union
2870         # of that object with the other parameter's code points;  if called as
2871         # a constructor, the first parameter gives the class the new object
2872         # should be, and the second parameter gives the code points to go into
2873         # it.
2874         # In either case, there are two parameters looked at by this routine;
2875         # any additional parameters are passed to the new() constructor.
2876         #
2877         # The code points can come in the form of some object that contains
2878         # ranges, and has a conventionally named method to access them; or
2879         # they can be an array of individual code points (as integers); or
2880         # just a single code point.
2881         #
2882         # If they are ranges, this routine doesn't make any effort to preserve
2883         # the range values of one input over the other.  Therefore this base
2884         # class should not allow _union to be called from other than
2885         # initialization code, so as to prevent two tables from being added
2886         # together where the range values matter.  The general form of this
2887         # routine therefore belongs in a derived class, but it was moved here
2888         # to avoid duplication of code.  The failure to overload this in this
2889         # class keeps it safe.
2890         #
2891
2892         my $self;
2893         my @args;   # Arguments to pass to the constructor
2894
2895         my $class = shift;
2896
2897         # If a method call, will start the union with the object itself, and
2898         # the class of the new object will be the same as self.
2899         if (ref $class) {
2900             $self = $class;
2901             $class = ref $self;
2902             push @args, $self;
2903         }
2904
2905         # Add the other required parameter.
2906         push @args, shift;
2907         # Rest of parameters are passed on to the constructor
2908
2909         # Accumulate all records from both lists.
2910         my @records;
2911         for my $arg (@args) {
2912             #local $to_trace = 0 if main::DEBUG;
2913             trace "argument = $arg" if main::DEBUG && $to_trace;
2914             if (! defined $arg) {
2915                 my $message = "";
2916                 if (defined $self) {
2917                     no overloading;
2918                     $message .= $owner_name_of{pack 'J', $self};
2919                 }
2920                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
2921                 return;
2922             }
2923             $arg = [ $arg ] if ! ref $arg;
2924             my $type = ref $arg;
2925             if ($type eq 'ARRAY') {
2926                 foreach my $element (@$arg) {
2927                     push @records, Range->new($element, $element);
2928                 }
2929             }
2930             elsif ($arg->isa('Range')) {
2931                 push @records, $arg;
2932             }
2933             elsif ($arg->can('ranges')) {
2934                 push @records, $arg->ranges;
2935             }
2936             else {
2937                 my $message = "";
2938                 if (defined $self) {
2939                     no overloading;
2940                     $message .= $owner_name_of{pack 'J', $self};
2941                 }
2942                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
2943                 return;
2944             }
2945         }
2946
2947         # Sort with the range containing the lowest ordinal first, but if
2948         # two ranges start at the same code point, sort with the bigger range
2949         # of the two first, because it takes fewer cycles.
2950         @records = sort { ($a->start <=> $b->start)
2951                                       or
2952                                     # if b is shorter than a, b->end will be
2953                                     # less than a->end, and we want to select
2954                                     # a, so want to return -1
2955                                     ($b->end <=> $a->end)
2956                                    } @records;
2957
2958         my $new = $class->new(@_);
2959
2960         # Fold in records so long as they add new information.
2961         for my $set (@records) {
2962             my $start = $set->start;
2963             my $end   = $set->end;
2964             my $value   = $set->value;
2965             if ($start > $new->max) {
2966                 $new->_add_delete('+', $start, $end, $value);
2967             }
2968             elsif ($end > $new->max) {
2969                 $new->_add_delete('+', $new->max +1, $end, $value);
2970             }
2971         }
2972
2973         return $new;
2974     }
2975
2976     sub range_count {        # Return the number of ranges in the range list
2977         my $self = shift;
2978         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2979
2980         no overloading;
2981         return scalar @{$ranges{pack 'J', $self}};
2982     }
2983
2984     sub min {
2985         # Returns the minimum code point currently in the range list, or if
2986         # the range list is empty, 2 beyond the max possible.  This is a
2987         # method because used so rarely, that not worth saving between calls,
2988         # and having to worry about changing it as ranges are added and
2989         # deleted.
2990
2991         my $self = shift;
2992         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2993
2994         my $addr = do { no overloading; pack 'J', $self; };
2995
2996         # If the range list is empty, return a large value that isn't adjacent
2997         # to any that could be in the range list, for simpler tests
2998         return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2999         return $ranges{$addr}->[0]->start;
3000     }
3001
3002     sub contains {
3003         # Boolean: Is argument in the range list?  If so returns $i such that:
3004         #   range[$i]->end < $codepoint <= range[$i+1]->end
3005         # which is one beyond what you want; this is so that the 0th range
3006         # doesn't return false
3007         my $self = shift;
3008         my $codepoint = shift;
3009         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3010
3011         my $i = $self->_search_ranges($codepoint);
3012         return 0 unless defined $i;
3013
3014         # The search returns $i, such that
3015         #   range[$i-1]->end < $codepoint <= range[$i]->end
3016         # So is in the table if and only iff it is at least the start position
3017         # of range $i.
3018         no overloading;
3019         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3020         return $i + 1;
3021     }
3022
3023     sub containing_range {
3024         # Returns the range object that contains the code point, undef if none
3025
3026         my $self = shift;
3027         my $codepoint = shift;
3028         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3029
3030         my $i = $self->contains($codepoint);
3031         return unless $i;
3032
3033         # contains() returns 1 beyond where we should look
3034         no overloading;
3035         return $ranges{pack 'J', $self}->[$i-1];
3036     }
3037
3038     sub value_of {
3039         # Returns the value associated with the code point, undef if none
3040
3041         my $self = shift;
3042         my $codepoint = shift;
3043         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3044
3045         my $range = $self->containing_range($codepoint);
3046         return unless defined $range;
3047
3048         return $range->value;
3049     }
3050
3051     sub type_of {
3052         # Returns the type of the range containing the code point, undef if
3053         # the code point is not in the table
3054
3055         my $self = shift;
3056         my $codepoint = shift;
3057         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3058
3059         my $range = $self->containing_range($codepoint);
3060         return unless defined $range;
3061
3062         return $range->type;
3063     }
3064
3065     sub _search_ranges {
3066         # Find the range in the list which contains a code point, or where it
3067         # should go if were to add it.  That is, it returns $i, such that:
3068         #   range[$i-1]->end < $codepoint <= range[$i]->end
3069         # Returns undef if no such $i is possible (e.g. at end of table), or
3070         # if there is an error.
3071
3072         my $self = shift;
3073         my $code_point = shift;
3074         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3075
3076         my $addr = do { no overloading; pack 'J', $self; };
3077
3078         return if $code_point > $max{$addr};
3079         my $r = $ranges{$addr};                # The current list of ranges
3080         my $range_list_size = scalar @$r;
3081         my $i;
3082
3083         use integer;        # want integer division
3084
3085         # Use the cached result as the starting guess for this one, because,
3086         # an experiment on 5.1 showed that 90% of the time the cache was the
3087         # same as the result on the next call (and 7% it was one less).
3088         $i = $_search_ranges_cache{$addr};
3089         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3090                                             # from an intervening deletion
3091         #local $to_trace = 1 if main::DEBUG;
3092         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3093         return $i if $code_point <= $r->[$i]->end
3094                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3095
3096         # Here the cache doesn't yield the correct $i.  Try adding 1.
3097         if ($i < $range_list_size - 1
3098             && $r->[$i]->end < $code_point &&
3099             $code_point <= $r->[$i+1]->end)
3100         {
3101             $i++;
3102             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3103             $_search_ranges_cache{$addr} = $i;
3104             return $i;
3105         }
3106
3107         # Here, adding 1 also didn't work.  We do a binary search to
3108         # find the correct position, starting with current $i
3109         my $lower = 0;
3110         my $upper = $range_list_size - 1;
3111         while (1) {
3112             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3113
3114             if ($code_point <= $r->[$i]->end) {
3115
3116                 # Here we have met the upper constraint.  We can quit if we
3117                 # also meet the lower one.
3118                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3119
3120                 $upper = $i;        # Still too high.
3121
3122             }
3123             else {
3124
3125                 # Here, $r[$i]->end < $code_point, so look higher up.
3126                 $lower = $i;
3127             }
3128
3129             # Split search domain in half to try again.
3130             my $temp = ($upper + $lower) / 2;
3131
3132             # No point in continuing unless $i changes for next time
3133             # in the loop.
3134             if ($temp == $i) {
3135
3136                 # We can't reach the highest element because of the averaging.
3137                 # So if one below the upper edge, force it there and try one
3138                 # more time.
3139                 if ($i == $range_list_size - 2) {
3140
3141                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3142                     $i = $range_list_size - 1;
3143
3144                     # Change $lower as well so if fails next time through,
3145                     # taking the average will yield the same $i, and we will
3146                     # quit with the error message just below.
3147                     $lower = $i;
3148                     next;
3149                 }
3150                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3151                 return;
3152             }
3153             $i = $temp;
3154         } # End of while loop
3155
3156         if (main::DEBUG && $to_trace) {
3157             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3158             trace "i=  [ $i ]", $r->[$i];
3159             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3160         }
3161
3162         # Here we have found the offset.  Cache it as a starting point for the
3163         # next call.
3164         $_search_ranges_cache{$addr} = $i;
3165         return $i;
3166     }
3167
3168     sub _add_delete {
3169         # Add, replace or delete ranges to or from a list.  The $type
3170         # parameter gives which:
3171         #   '+' => insert or replace a range, returning a list of any changed
3172         #          ranges.
3173         #   '-' => delete a range, returning a list of any deleted ranges.
3174         #
3175         # The next three parameters give respectively the start, end, and
3176         # value associated with the range.  'value' should be null unless the
3177         # operation is '+';
3178         #
3179         # The range list is kept sorted so that the range with the lowest
3180         # starting position is first in the list, and generally, adjacent
3181         # ranges with the same values are merged into a single larger one (see
3182         # exceptions below).
3183         #
3184         # There are more parameters; all are key => value pairs:
3185         #   Type    gives the type of the value.  It is only valid for '+'.
3186         #           All ranges have types; if this parameter is omitted, 0 is
3187         #           assumed.  Ranges with type 0 are assumed to obey the
3188         #           Unicode rules for casing, etc; ranges with other types are
3189         #           not.  Otherwise, the type is arbitrary, for the caller's
3190         #           convenience, and looked at only by this routine to keep
3191         #           adjacent ranges of different types from being merged into
3192         #           a single larger range, and when Replace =>
3193         #           $IF_NOT_EQUIVALENT is specified (see just below).
3194         #   Replace  determines what to do if the range list already contains
3195         #            ranges which coincide with all or portions of the input
3196         #            range.  It is only valid for '+':
3197         #       => $NO            means that the new value is not to replace
3198         #                         any existing ones, but any empty gaps of the
3199         #                         range list coinciding with the input range
3200         #                         will be filled in with the new value.
3201         #       => $UNCONDITIONALLY  means to replace the existing values with
3202         #                         this one unconditionally.  However, if the
3203         #                         new and old values are identical, the
3204         #                         replacement is skipped to save cycles
3205         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3206         #                         with this one if they are not equivalent.
3207         #                         Ranges are equivalent if their types are the
3208         #                         same, and they are the same string; or if
3209         #                         both are type 0 ranges, if their Unicode
3210         #                         standard forms are identical.  In this last
3211         #                         case, the routine chooses the more "modern"
3212         #                         one to use.  This is because some of the
3213         #                         older files are formatted with values that
3214         #                         are, for example, ALL CAPs, whereas the
3215         #                         derived files have a more modern style,
3216         #                         which looks better.  By looking for this
3217         #                         style when the pre-existing and replacement
3218         #                         standard forms are the same, we can move to
3219         #                         the modern style
3220         #       => $MULTIPLE      means that if this range duplicates an
3221         #                         existing one, but has a different value,
3222         #                         don't replace the existing one, but insert
3223         #                         this, one so that the same range can occur
3224         #                         multiple times.  They are stored LIFO, so
3225         #                         that the final one inserted is the first one
3226         #                         returned in an ordered search of the table.
3227         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3228         #
3229         # "same value" means identical for non-type-0 ranges, and it means
3230         # having the same standard forms for type-0 ranges.
3231
3232         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3233
3234         my $self = shift;
3235         my $operation = shift;   # '+' for add/replace; '-' for delete;
3236         my $start = shift;
3237         my $end   = shift;
3238         my $value = shift;
3239
3240         my %args = @_;
3241
3242         $value = "" if not defined $value;        # warning: $value can be "0"
3243
3244         my $replace = delete $args{'Replace'};
3245         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3246
3247         my $type = delete $args{'Type'};
3248         $type = 0 unless defined $type;
3249
3250         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3251
3252         my $addr = do { no overloading; pack 'J', $self; };
3253
3254         if ($operation ne '+' && $operation ne '-') {
3255             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3256             return;
3257         }
3258         unless (defined $start && defined $end) {
3259             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3260             return;
3261         }
3262         unless ($end >= $start) {
3263             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3264             return;
3265         }
3266         #local $to_trace = 1 if main::DEBUG;
3267
3268         if ($operation eq '-') {
3269             if ($replace != $IF_NOT_EQUIVALENT) {
3270                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3271                 $replace = $IF_NOT_EQUIVALENT;
3272             }
3273             if ($type) {
3274                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3275                 $type = 0;
3276             }
3277             if ($value ne "") {
3278                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3279                 $value = "";
3280             }
3281         }
3282
3283         my $r = $ranges{$addr};               # The current list of ranges
3284         my $range_list_size = scalar @$r;     # And its size
3285         my $max = $max{$addr};                # The current high code point in
3286                                               # the list of ranges
3287
3288         # Do a special case requiring fewer machine cycles when the new range
3289         # starts after the current highest point.  The Unicode input data is
3290         # structured so this is common.
3291         if ($start > $max) {
3292
3293             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3294             return if $operation eq '-'; # Deleting a non-existing range is a
3295                                          # no-op
3296
3297             # If the new range doesn't logically extend the current final one
3298             # in the range list, create a new range at the end of the range
3299             # list.  (max cleverly is initialized to a negative number not
3300             # adjacent to 0 if the range list is empty, so even adding a range
3301             # to an empty range list starting at 0 will have this 'if'
3302             # succeed.)
3303             if ($start > $max + 1        # non-adjacent means can't extend.
3304                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3305                 || @{$r}[-1]->type != $type # types differ, can't extend.
3306             ) {
3307                 push @$r, Range->new($start, $end,
3308                                      Value => $value,
3309                                      Type => $type);
3310             }
3311             else {
3312
3313                 # Here, the new range starts just after the current highest in
3314                 # the range list, and they have the same type and value.
3315                 # Extend the current range to incorporate the new one.
3316                 @{$r}[-1]->set_end($end);
3317             }
3318
3319             # This becomes the new maximum.
3320             $max{$addr} = $end;
3321
3322             return;
3323         }
3324         #local $to_trace = 0 if main::DEBUG;
3325
3326         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3327
3328         # Here, the input range isn't after the whole rest of the range list.
3329         # Most likely 'splice' will be needed.  The rest of the routine finds
3330         # the needed splice parameters, and if necessary, does the splice.
3331         # First, find the offset parameter needed by the splice function for
3332         # the input range.  Note that the input range may span multiple
3333         # existing ones, but we'll worry about that later.  For now, just find
3334         # the beginning.  If the input range is to be inserted starting in a
3335         # position not currently in the range list, it must (obviously) come
3336         # just after the range below it, and just before the range above it.
3337         # Slightly less obviously, it will occupy the position currently
3338         # occupied by the range that is to come after it.  More formally, we
3339         # are looking for the position, $i, in the array of ranges, such that:
3340         #
3341         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3342         #
3343         # (The ordered relationships within existing ranges are also shown in
3344         # the equation above).  However, if the start of the input range is
3345         # within an existing range, the splice offset should point to that
3346         # existing range's position in the list; that is $i satisfies a
3347         # somewhat different equation, namely:
3348         #
3349         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3350         #
3351         # More briefly, $start can come before or after r[$i]->start, and at
3352         # this point, we don't know which it will be.  However, these
3353         # two equations share these constraints:
3354         #
3355         #   r[$i-1]->end < $start <= r[$i]->end
3356         #
3357         # And that is good enough to find $i.
3358
3359         my $i = $self->_search_ranges($start);
3360         if (! defined $i) {
3361             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3362             return;
3363         }
3364
3365         # The search function returns $i such that:
3366         #
3367         # r[$i-1]->end < $start <= r[$i]->end
3368         #
3369         # That means that $i points to the first range in the range list
3370         # that could possibly be affected by this operation.  We still don't
3371         # know if the start of the input range is within r[$i], or if it
3372         # points to empty space between r[$i-1] and r[$i].
3373         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3374
3375         # Special case the insertion of data that is not to replace any
3376         # existing data.
3377         if ($replace == $NO) {  # If $NO, has to be operation '+'
3378             #local $to_trace = 1 if main::DEBUG;
3379             trace "Doesn't replace" if main::DEBUG && $to_trace;
3380
3381             # Here, the new range is to take effect only on those code points
3382             # that aren't already in an existing range.  This can be done by
3383             # looking through the existing range list and finding the gaps in
3384             # the ranges that this new range affects, and then calling this
3385             # function recursively on each of those gaps, leaving untouched
3386             # anything already in the list.  Gather up a list of the changed
3387             # gaps first so that changes to the internal state as new ranges
3388             # are added won't be a problem.
3389             my @gap_list;
3390
3391             # First, if the starting point of the input range is outside an
3392             # existing one, there is a gap from there to the beginning of the
3393             # existing range -- add a span to fill the part that this new
3394             # range occupies
3395             if ($start < $r->[$i]->start) {
3396                 push @gap_list, Range->new($start,
3397                                            main::min($end,
3398                                                      $r->[$i]->start - 1),
3399                                            Type => $type);
3400                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3401             }
3402
3403             # Then look through the range list for other gaps until we reach
3404             # the highest range affected by the input one.
3405             my $j;
3406             for ($j = $i+1; $j < $range_list_size; $j++) {
3407                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3408                 last if $end < $r->[$j]->start;
3409
3410                 # If there is a gap between when this range starts and the
3411                 # previous one ends, add a span to fill it.  Note that just
3412                 # because there are two ranges doesn't mean there is a
3413                 # non-zero gap between them.  It could be that they have
3414                 # different values or types
3415                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3416                     push @gap_list,
3417                         Range->new($r->[$j-1]->end + 1,
3418                                    $r->[$j]->start - 1,
3419                                    Type => $type);
3420                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3421                 }
3422             }
3423
3424             # Here, we have either found an existing range in the range list,
3425             # beyond the area affected by the input one, or we fell off the
3426             # end of the loop because the input range affects the whole rest
3427             # of the range list.  In either case, $j is 1 higher than the
3428             # highest affected range.  If $j == $i, it means that there are no
3429             # affected ranges, that the entire insertion is in the gap between
3430             # r[$i-1], and r[$i], which we already have taken care of before
3431             # the loop.
3432             # On the other hand, if there are affected ranges, it might be
3433             # that there is a gap that needs filling after the final such
3434             # range to the end of the input range
3435             if ($r->[$j-1]->end < $end) {
3436                     push @gap_list, Range->new(main::max($start,
3437                                                          $r->[$j-1]->end + 1),
3438                                                $end,
3439                                                Type => $type);
3440                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3441             }
3442
3443             # Call recursively to fill in all the gaps.
3444             foreach my $gap (@gap_list) {
3445                 $self->_add_delete($operation,
3446                                    $gap->start,
3447                                    $gap->end,
3448                                    $value,
3449                                    Type => $type);
3450             }
3451
3452             return;
3453         }
3454
3455         # Here, we have taken care of the case where $replace is $NO.
3456         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3457         # If inserting a multiple record, this is where it goes, before the
3458         # first (if any) existing one.  This implies an insertion, and no
3459         # change to any existing ranges.  Note that $i can be -1 if this new
3460         # range doesn't actually duplicate any existing, and comes at the
3461         # beginning of the list.
3462         if ($replace == $MULTIPLE) {
3463
3464             if ($start != $end) {
3465                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3466                 return;
3467             }
3468
3469             # Don't add an exact duplicate, as it isn't really a multiple
3470             if ($end >= $r->[$i]->start) {
3471                 if ($r->[$i]->start != $r->[$i]->end) {
3472                     Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the other range ($r->[$i]) contains more than one code point.  No action taken.");
3473                     return;
3474                 }
3475                 return if $value eq $r->[$i]->value && $type eq $r->[$i]->type;
3476             }
3477
3478             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3479             my @return = splice @$r,
3480                                 $i,
3481                                 0,
3482                                 Range->new($start,
3483                                            $end,
3484                                            Value => $value,
3485                                            Type => $type);
3486             if (main::DEBUG && $to_trace) {
3487                 trace "After splice:";
3488                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3489                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3490                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
3491                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3492                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3493                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3494             }
3495             return @return;
3496         }
3497
3498         # Here, we have taken care of $NO and $MULTIPLE replaces.  This leaves
3499         # delete, insert, and replace either unconditionally or if not
3500         # equivalent.  $i still points to the first potential affected range.
3501         # Now find the highest range affected, which will determine the length
3502         # parameter to splice.  (The input range can span multiple existing
3503         # ones.)  If this isn't a deletion, while we are looking through the
3504         # range list, see also if this is a replacement rather than a clean
3505         # insertion; that is if it will change the values of at least one
3506         # existing range.  Start off assuming it is an insert, until find it
3507         # isn't.
3508         my $clean_insert = $operation eq '+';
3509         my $j;        # This will point to the highest affected range
3510
3511         # For non-zero types, the standard form is the value itself;
3512         my $standard_form = ($type) ? $value : main::standardize($value);
3513
3514         for ($j = $i; $j < $range_list_size; $j++) {
3515             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3516
3517             # If find a range that it doesn't overlap into, we can stop
3518             # searching
3519             last if $end < $r->[$j]->start;
3520
3521             # Here, overlaps the range at $j.  If the values don't match,
3522             # and so far we think this is a clean insertion, it becomes a
3523             # non-clean insertion, i.e., a 'change' or 'replace' instead.
3524             if ($clean_insert) {
3525                 if ($r->[$j]->standard_form ne $standard_form) {
3526                     $clean_insert = 0;
3527                     if ($replace == $CROAK) {
3528                         main::croak("The range to add "
3529                         . sprintf("%04X", $start)
3530                         . '-'
3531                         . sprintf("%04X", $end)
3532                         . " with value '$value' overlaps an existing range $r->[$j]");
3533                     }
3534                 }
3535                 else {
3536
3537                     # Here, the two values are essentially the same.  If the
3538                     # two are actually identical, replacing wouldn't change
3539                     # anything so skip it.
3540                     my $pre_existing = $r->[$j]->value;
3541                     if ($pre_existing ne $value) {
3542
3543                         # Here the new and old standardized values are the
3544                         # same, but the non-standardized values aren't.  If
3545                         # replacing unconditionally, then replace
3546                         if( $replace == $UNCONDITIONALLY) {
3547                             $clean_insert = 0;
3548                         }
3549                         else {
3550
3551                             # Here, are replacing conditionally.  Decide to
3552                             # replace or not based on which appears to look
3553                             # the "nicest".  If one is mixed case and the
3554                             # other isn't, choose the mixed case one.
3555                             my $new_mixed = $value =~ /[A-Z]/
3556                                             && $value =~ /[a-z]/;
3557                             my $old_mixed = $pre_existing =~ /[A-Z]/
3558                                             && $pre_existing =~ /[a-z]/;
3559
3560                             if ($old_mixed != $new_mixed) {
3561                                 $clean_insert = 0 if $new_mixed;
3562                                 if (main::DEBUG && $to_trace) {
3563                                     if ($clean_insert) {
3564                                         trace "Retaining $pre_existing over $value";
3565                                     }
3566                                     else {
3567                                         trace "Replacing $pre_existing with $value";
3568                                     }
3569                                 }
3570                             }
3571                             else {
3572
3573                                 # Here casing wasn't different between the two.
3574                                 # If one has hyphens or underscores and the
3575                                 # other doesn't, choose the one with the
3576                                 # punctuation.
3577                                 my $new_punct = $value =~ /[-_]/;
3578                                 my $old_punct = $pre_existing =~ /[-_]/;
3579
3580                                 if ($old_punct != $new_punct) {
3581                                     $clean_insert = 0 if $new_punct;
3582                                     if (main::DEBUG && $to_trace) {
3583                                         if ($clean_insert) {
3584                                             trace "Retaining $pre_existing over $value";
3585                                         }
3586                                         else {
3587                                             trace "Replacing $pre_existing with $value";
3588                                         }
3589                                     }
3590                                 }   # else existing one is just as "good";
3591                                     # retain it to save cycles.
3592                             }
3593                         }
3594                     }
3595                 }
3596             }
3597         } # End of loop looking for highest affected range.
3598
3599         # Here, $j points to one beyond the highest range that this insertion
3600         # affects (hence to beyond the range list if that range is the final
3601         # one in the range list).
3602
3603         # The splice length is all the affected ranges.  Get it before
3604         # subtracting, for efficiency, so we don't have to later add 1.
3605         my $length = $j - $i;
3606
3607         $j--;        # $j now points to the highest affected range.
3608         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3609
3610         # Here, have taken care of $NO and $MULTIPLE replaces.
3611         # $j points to the highest affected range.  But it can be < $i or even
3612         # -1.  These happen only if the insertion is entirely in the gap
3613         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3614         # above exited first time through with $end < $r->[$i]->start.  (And
3615         # then we subtracted one from j)  This implies also that $start <
3616         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3617         # $start, so the entire input range is in the gap.
3618         if ($j < $i) {
3619
3620             # Here the entire input range is in the gap before $i.
3621
3622             if (main::DEBUG && $to_trace) {
3623                 if ($i) {
3624                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3625                 }
3626                 else {
3627                     trace "Entire range is before $r->[$i]";
3628                 }
3629             }
3630             return if $operation ne '+'; # Deletion of a non-existent range is
3631                                          # a no-op
3632         }
3633         else {
3634
3635             # Here part of the input range is not in the gap before $i.  Thus,
3636             # there is at least one affected one, and $j points to the highest
3637             # such one.
3638
3639             # At this point, here is the situation:
3640             # This is not an insertion of a multiple, nor of tentative ($NO)
3641             # data.
3642             #   $i  points to the first element in the current range list that
3643             #            may be affected by this operation.  In fact, we know
3644             #            that the range at $i is affected because we are in
3645             #            the else branch of this 'if'
3646             #   $j  points to the highest affected range.
3647             # In other words,
3648             #   r[$i-1]->end < $start <= r[$i]->end
3649             # And:
3650             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3651             #
3652             # Also:
3653             #   $clean_insert is a boolean which is set true if and only if
3654             #        this is a "clean insertion", i.e., not a change nor a
3655             #        deletion (multiple was handled above).
3656
3657             # We now have enough information to decide if this call is a no-op
3658             # or not.  It is a no-op if this is an insertion of already
3659             # existing data.
3660
3661             if (main::DEBUG && $to_trace && $clean_insert
3662                                          && $i == $j
3663                                          && $start >= $r->[$i]->start)
3664             {
3665                     trace "no-op";
3666             }
3667             return if $clean_insert
3668                       && $i == $j # more than one affected range => not no-op
3669
3670                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3671                       # Further, $start and/or $end is >= r[$i]->start
3672                       # The test below hence guarantees that
3673                       #     r[$i]->start < $start <= $end <= r[$i]->end
3674                       # This means the input range is contained entirely in
3675                       # the one at $i, so is a no-op
3676                       && $start >= $r->[$i]->start;
3677         }
3678
3679         # Here, we know that some action will have to be taken.  We have
3680         # calculated the offset and length (though adjustments may be needed)
3681         # for the splice.  Now start constructing the replacement list.
3682         my @replacement;
3683         my $splice_start = $i;
3684
3685         my $extends_below;
3686         my $extends_above;
3687
3688         # See if should extend any adjacent ranges.
3689         if ($operation eq '-') { # Don't extend deletions
3690             $extends_below = $extends_above = 0;
3691         }
3692         else {  # Here, should extend any adjacent ranges.  See if there are
3693                 # any.
3694             $extends_below = ($i > 0
3695                             # can't extend unless adjacent
3696                             && $r->[$i-1]->end == $start -1
3697                             # can't extend unless are same standard value
3698                             && $r->[$i-1]->standard_form eq $standard_form
3699                             # can't extend unless share type
3700                             && $r->[$i-1]->type == $type);
3701             $extends_above = ($j+1 < $range_list_size
3702                             && $r->[$j+1]->start == $end +1
3703                             && $r->[$j+1]->standard_form eq $standard_form
3704                             && $r->[$j+1]->type == $type);
3705         }
3706         if ($extends_below && $extends_above) { # Adds to both
3707             $splice_start--;     # start replace at element below
3708             $length += 2;        # will replace on both sides
3709             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3710
3711             # The result will fill in any gap, replacing both sides, and
3712             # create one large range.
3713             @replacement = Range->new($r->[$i-1]->start,
3714                                       $r->[$j+1]->end,
3715                                       Value => $value,
3716                                       Type => $type);
3717         }
3718         else {
3719
3720             # Here we know that the result won't just be the conglomeration of
3721             # a new range with both its adjacent neighbors.  But it could
3722             # extend one of them.
3723
3724             if ($extends_below) {
3725
3726                 # Here the new element adds to the one below, but not to the
3727                 # one above.  If inserting, and only to that one range,  can
3728                 # just change its ending to include the new one.
3729                 if ($length == 0 && $clean_insert) {
3730                     $r->[$i-1]->set_end($end);
3731                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3732                     return;
3733                 }
3734                 else {
3735                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3736                     $splice_start--;        # start replace at element below
3737                     $length++;              # will replace the element below
3738                     $start = $r->[$i-1]->start;
3739                 }
3740             }
3741             elsif ($extends_above) {
3742
3743                 # Here the new element adds to the one above, but not below.
3744                 # Mirror the code above
3745                 if ($length == 0 && $clean_insert) {
3746                     $r->[$j+1]->set_start($start);
3747                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3748                     return;
3749                 }
3750                 else {
3751                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3752                     $length++;        # will replace the element above
3753                     $end = $r->[$j+1]->end;
3754                 }
3755             }
3756
3757             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3758
3759             # Finally, here we know there will have to be a splice.
3760             # If the change or delete affects only the highest portion of the
3761             # first affected range, the range will have to be split.  The
3762             # splice will remove the whole range, but will replace it by a new
3763             # range containing just the unaffected part.  So, in this case,
3764             # add to the replacement list just this unaffected portion.
3765             if (! $extends_below
3766                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3767             {
3768                 push @replacement,
3769                     Range->new($r->[$i]->start,
3770                                $start - 1,
3771                                Value => $r->[$i]->value,
3772                                Type => $r->[$i]->type);
3773             }
3774
3775             # In the case of an insert or change, but not a delete, we have to
3776             # put in the new stuff;  this comes next.
3777             if ($operation eq '+') {
3778                 push @replacement, Range->new($start,
3779                                               $end,
3780                                               Value => $value,
3781                                               Type => $type);
3782             }
3783
3784             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3785             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3786
3787             # And finally, if we're changing or deleting only a portion of the
3788             # highest affected range, it must be split, as the lowest one was.
3789             if (! $extends_above
3790                 && $j >= 0  # Remember that j can be -1 if before first
3791                             # current element
3792                 && $end >= $r->[$j]->start
3793                 && $end < $r->[$j]->end)
3794             {
3795                 push @replacement,
3796                     Range->new($end + 1,
3797                                $r->[$j]->end,
3798                                Value => $r->[$j]->value,
3799                                Type => $r->[$j]->type);
3800             }
3801         }
3802
3803         # And do the splice, as calculated above
3804         if (main::DEBUG && $to_trace) {
3805             trace "replacing $length element(s) at $i with ";
3806             foreach my $replacement (@replacement) {
3807                 trace "    $replacement";
3808             }
3809             trace "Before splice:";
3810             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3811             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3812             trace "i  =[", $i, "]", $r->[$i];
3813             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3814             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3815         }
3816
3817         my @return = splice @$r, $splice_start, $length, @replacement;
3818
3819         if (main::DEBUG && $to_trace) {
3820             trace "After splice:";
3821             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3822             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3823             trace "i  =[", $i, "]", $r->[$i];
3824             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3825             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3826             trace "removed ", @return if @return;
3827         }
3828
3829         # An actual deletion could have changed the maximum in the list.
3830         # There was no deletion if the splice didn't return something, but
3831         # otherwise recalculate it.  This is done too rarely to worry about
3832         # performance.
3833         if ($operation eq '-' && @return) {
3834             $max{$addr} = $r->[-1]->end;
3835         }
3836         return @return;
3837     }
3838
3839     sub reset_each_range {  # reset the iterator for each_range();
3840         my $self = shift;
3841         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3842
3843         no overloading;
3844         undef $each_range_iterator{pack 'J', $self};
3845         return;
3846     }
3847
3848     sub each_range {
3849         # Iterate over each range in a range list.  Results are undefined if
3850         # the range list is changed during the iteration.
3851
3852         my $self = shift;
3853         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3854
3855         my $addr = do { no overloading; pack 'J', $self; };
3856
3857         return if $self->is_empty;
3858
3859         $each_range_iterator{$addr} = -1
3860                                 if ! defined $each_range_iterator{$addr};
3861         $each_range_iterator{$addr}++;
3862         return $ranges{$addr}->[$each_range_iterator{$addr}]
3863                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
3864         undef $each_range_iterator{$addr};
3865         return;
3866     }
3867
3868     sub count {        # Returns count of code points in range list
3869         my $self = shift;
3870         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3871
3872         my $addr = do { no overloading; pack 'J', $self; };
3873
3874         my $count = 0;
3875         foreach my $range (@{$ranges{$addr}}) {
3876             $count += $range->end - $range->start + 1;
3877         }
3878         return $count;
3879     }
3880
3881     sub delete_range {    # Delete a range
3882         my $self = shift;
3883         my $start = shift;
3884         my $end = shift;
3885
3886         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3887
3888         return $self->_add_delete('-', $start, $end, "");
3889     }
3890
3891     sub is_empty { # Returns boolean as to if a range list is empty
3892         my $self = shift;
3893         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3894
3895         no overloading;
3896         return scalar @{$ranges{pack 'J', $self}} == 0;
3897     }
3898
3899     sub hash {
3900         # Quickly returns a scalar suitable for separating tables into
3901         # buckets, i.e. it is a hash function of the contents of a table, so
3902         # there are relatively few conflicts.
3903
3904         my $self = shift;
3905         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3906
3907         my $addr = do { no overloading; pack 'J', $self; };
3908
3909         # These are quickly computable.  Return looks like 'min..max;count'
3910         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3911     }
3912 } # End closure for _Range_List_Base
3913
3914 package Range_List;
3915 use base '_Range_List_Base';
3916
3917 # A Range_List is a range list for match tables; i.e. the range values are
3918 # not significant.  Thus a number of operations can be safely added to it,
3919 # such as inversion, intersection.  Note that union is also an unsafe
3920 # operation when range values are cared about, and that method is in the base
3921 # class, not here.  But things are set up so that that method is callable only
3922 # during initialization.  Only in this derived class, is there an operation
3923 # that combines two tables.  A Range_Map can thus be used to initialize a
3924 # Range_List, and its mappings will be in the list, but are not significant to
3925 # this class.
3926
3927 sub trace { return main::trace(@_); }
3928
3929 { # Closure
3930
3931     use overload
3932         fallback => 0,
3933         '+' => sub { my $self = shift;
3934                     my $other = shift;
3935
3936                     return $self->_union($other)
3937                 },
3938         '&' => sub { my $self = shift;
3939                     my $other = shift;
3940
3941                     return $self->_intersect($other, 0);
3942                 },
3943         '~' => "_invert",
3944         '-' => "_subtract",
3945     ;
3946
3947     sub _invert {
3948         # Returns a new Range_List that gives all code points not in $self.
3949
3950         my $self = shift;
3951
3952         my $new = Range_List->new;
3953
3954         # Go through each range in the table, finding the gaps between them
3955         my $max = -1;   # Set so no gap before range beginning at 0
3956         for my $range ($self->ranges) {
3957             my $start = $range->start;
3958             my $end   = $range->end;
3959
3960             # If there is a gap before this range, the inverse will contain
3961             # that gap.
3962             if ($start > $max + 1) {
3963                 $new->add_range($max + 1, $start - 1);
3964             }
3965             $max = $end;
3966         }
3967
3968         # And finally, add the gap from the end of the table to the max
3969         # possible code point
3970         if ($max < $LAST_UNICODE_CODEPOINT) {
3971             $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3972         }
3973         return $new;
3974     }
3975
3976     sub _subtract {
3977         # Returns a new Range_List with the argument deleted from it.  The
3978         # argument can be a single code point, a range, or something that has
3979         # a range, with the _range_list() method on it returning them
3980
3981         my $self = shift;
3982         my $other = shift;
3983         my $reversed = shift;
3984         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3985
3986         if ($reversed) {
3987             Carp::my_carp_bug("Can't cope with a "
3988              .  __PACKAGE__
3989              . " being the second parameter in a '-'.  Subtraction ignored.");
3990             return $self;
3991         }
3992
3993         my $new = Range_List->new(Initialize => $self);
3994
3995         if (! ref $other) { # Single code point
3996             $new->delete_range($other, $other);
3997         }
3998         elsif ($other->isa('Range')) {
3999             $new->delete_range($other->start, $other->end);
4000         }
4001         elsif ($other->can('_range_list')) {
4002             foreach my $range ($other->_range_list->ranges) {
4003                 $new->delete_range($range->start, $range->end);
4004             }
4005         }
4006         else {
4007             Carp::my_carp_bug("Can't cope with a "
4008                         . ref($other)
4009                         . " argument to '-'.  Subtraction ignored."
4010                         );
4011             return $self;
4012         }
4013
4014         return $new;
4015     }
4016
4017     sub _intersect {
4018         # Returns either a boolean giving whether the two inputs' range lists
4019         # intersect (overlap), or a new Range_List containing the intersection
4020         # of the two lists.  The optional final parameter being true indicates
4021         # to do the check instead of the intersection.
4022
4023         my $a_object = shift;
4024         my $b_object = shift;
4025         my $check_if_overlapping = shift;
4026         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4027         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4028
4029         if (! defined $b_object) {
4030             my $message = "";
4031             $message .= $a_object->_owner_name_of if defined $a_object;
4032             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4033             return;
4034         }
4035
4036         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4037         # Thus the intersection could be much more simply be written:
4038         #   return ~(~$a_object + ~$b_object);
4039         # But, this is slower, and when taking the inverse of a large
4040         # range_size_1 table, back when such tables were always stored that
4041         # way, it became prohibitively slow, hence the code was changed to the
4042         # below
4043
4044         if ($b_object->isa('Range')) {
4045             $b_object = Range_List->new(Initialize => $b_object,
4046                                         Owner => $a_object->_owner_name_of);
4047         }
4048         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4049
4050         my @a_ranges = $a_object->ranges;
4051         my @b_ranges = $b_object->ranges;
4052
4053         #local $to_trace = 1 if main::DEBUG;
4054         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4055
4056         # Start with the first range in each list
4057         my $a_i = 0;
4058         my $range_a = $a_ranges[$a_i];
4059         my $b_i = 0;
4060         my $range_b = $b_ranges[$b_i];
4061
4062         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4063                                                 if ! $check_if_overlapping;
4064
4065         # If either list is empty, there is no intersection and no overlap
4066         if (! defined $range_a || ! defined $range_b) {
4067             return $check_if_overlapping ? 0 : $new;
4068         }
4069         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4070
4071         # Otherwise, must calculate the intersection/overlap.  Start with the
4072         # very first code point in each list
4073         my $a = $range_a->start;
4074         my $b = $range_b->start;
4075
4076         # Loop through all the ranges of each list; in each iteration, $a and
4077         # $b are the current code points in their respective lists
4078         while (1) {
4079
4080             # If $a and $b are the same code point, ...
4081             if ($a == $b) {
4082
4083                 # it means the lists overlap.  If just checking for overlap
4084                 # know the answer now,
4085                 return 1 if $check_if_overlapping;
4086
4087                 # The intersection includes this code point plus anything else
4088                 # common to both current ranges.
4089                 my $start = $a;
4090                 my $end = main::min($range_a->end, $range_b->end);
4091                 if (! $check_if_overlapping) {
4092                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4093                     $new->add_range($start, $end);
4094                 }
4095
4096                 # Skip ahead to the end of the current intersect
4097                 $a = $b = $end;
4098
4099                 # If the current intersect ends at the end of either range (as
4100                 # it must for at least one of them), the next possible one
4101                 # will be the beginning code point in it's list's next range.
4102                 if ($a == $range_a->end) {
4103                     $range_a = $a_ranges[++$a_i];
4104                     last unless defined $range_a;
4105                     $a = $range_a->start;
4106                 }
4107                 if ($b == $range_b->end) {
4108                     $range_b = $b_ranges[++$b_i];
4109                     last unless defined $range_b;
4110                     $b = $range_b->start;
4111                 }
4112
4113                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4114             }
4115             elsif ($a < $b) {
4116
4117                 # Not equal, but if the range containing $a encompasses $b,
4118                 # change $a to be the middle of the range where it does equal
4119                 # $b, so the next iteration will get the intersection
4120                 if ($range_a->end >= $b) {
4121                     $a = $b;
4122                 }
4123                 else {
4124
4125                     # Here, the current range containing $a is entirely below
4126                     # $b.  Go try to find a range that could contain $b.
4127                     $a_i = $a_object->_search_ranges($b);
4128
4129                     # If no range found, quit.
4130                     last unless defined $a_i;
4131
4132                     # The search returns $a_i, such that
4133                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4134                     # Set $a to the beginning of this new range, and repeat.
4135                     $range_a = $a_ranges[$a_i];
4136                     $a = $range_a->start;
4137                 }
4138             }
4139             else { # Here, $b < $a.
4140
4141                 # Mirror image code to the leg just above
4142                 if ($range_b->end >= $a) {
4143                     $b = $a;
4144                 }
4145                 else {
4146                     $b_i = $b_object->_search_ranges($a);
4147                     last unless defined $b_i;
4148                     $range_b = $b_ranges[$b_i];
4149                     $b = $range_b->start;
4150                 }
4151             }
4152         } # End of looping through ranges.
4153
4154         # Intersection fully computed, or now know that there is no overlap
4155         return $check_if_overlapping ? 0 : $new;
4156     }
4157
4158     sub overlaps {
4159         # Returns boolean giving whether the two arguments overlap somewhere
4160
4161         my $self = shift;
4162         my $other = shift;
4163         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4164
4165         return $self->_intersect($other, 1);
4166     }
4167
4168     sub add_range {
4169         # Add a range to the list.
4170
4171         my $self = shift;
4172         my $start = shift;
4173         my $end = shift;
4174         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4175
4176         return $self->_add_delete('+', $start, $end, "");
4177     }
4178
4179     sub matches_identically_to {
4180         # Return a boolean as to whether or not two Range_Lists match identical
4181         # sets of code points.
4182
4183         my $self = shift;
4184         my $other = shift;
4185         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4186
4187         # These are ordered in increasing real time to figure out (at least
4188         # until a patch changes that and doesn't change this)
4189         return 0 if $self->max != $other->max;
4190         return 0 if $self->min != $other->min;
4191         return 0 if $self->range_count != $other->range_count;
4192         return 0 if $self->count != $other->count;
4193
4194         # Here they could be identical because all the tests above passed.
4195         # The loop below is somewhat simpler since we know they have the same
4196         # number of elements.  Compare range by range, until reach the end or
4197         # find something that differs.
4198         my @a_ranges = $self->ranges;
4199         my @b_ranges = $other->ranges;
4200         for my $i (0 .. @a_ranges - 1) {
4201             my $a = $a_ranges[$i];
4202             my $b = $b_ranges[$i];
4203             trace "self $a; other $b" if main::DEBUG && $to_trace;
4204             return 0 if $a->start != $b->start || $a->end != $b->end;
4205         }
4206         return 1;
4207     }
4208
4209     sub is_code_point_usable {
4210         # This used only for making the test script.  See if the input
4211         # proposed trial code point is one that Perl will handle.  If second
4212         # parameter is 0, it won't select some code points for various
4213         # reasons, noted below.
4214
4215         my $code = shift;
4216         my $try_hard = shift;
4217         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4218
4219         return 0 if $code < 0;                # Never use a negative
4220
4221         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4222         # the character very frequently used.
4223         return $try_hard if $code == 0x0000;
4224
4225         # shun non-character code points.
4226         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4227         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4228
4229         return $try_hard if $code > $LAST_UNICODE_CODEPOINT;   # keep in range
4230         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4231
4232         return 1;
4233     }
4234
4235     sub get_valid_code_point {
4236         # Return a code point that's part of the range list.  Returns nothing
4237         # if the table is empty or we can't find a suitable code point.  This
4238         # used only for making the test script.
4239
4240         my $self = shift;
4241         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4242
4243         my $addr = do { no overloading; pack 'J', $self; };
4244
4245         # On first pass, don't choose less desirable code points; if no good
4246         # one is found, repeat, allowing a less desirable one to be selected.
4247         for my $try_hard (0, 1) {
4248
4249             # Look through all the ranges for a usable code point.
4250             for my $set ($self->ranges) {
4251
4252                 # Try the edge cases first, starting with the end point of the
4253                 # range.
4254                 my $end = $set->end;
4255                 return $end if is_code_point_usable($end, $try_hard);
4256
4257                 # End point didn't, work.  Start at the beginning and try
4258                 # every one until find one that does work.
4259                 for my $trial ($set->start .. $end - 1) {
4260                     return $trial if is_code_point_usable($trial, $try_hard);
4261                 }
4262             }
4263         }
4264         return ();  # If none found, give up.
4265     }
4266
4267     sub get_invalid_code_point {
4268         # Return a code point that's not part of the table.  Returns nothing
4269         # if the table covers all code points or a suitable code point can't
4270         # be found.  This used only for making the test script.
4271
4272         my $self = shift;
4273         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4274
4275         # Just find a valid code point of the inverse, if any.
4276         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4277     }
4278 } # end closure for Range_List
4279
4280 package Range_Map;
4281 use base '_Range_List_Base';
4282
4283 # A Range_Map is a range list in which the range values (called maps) are
4284 # significant, and hence shouldn't be manipulated by our other code, which
4285 # could be ambiguous or lose things.  For example, in taking the union of two
4286 # lists, which share code points, but which have differing values, which one
4287 # has precedence in the union?
4288 # It turns out that these operations aren't really necessary for map tables,
4289 # and so this class was created to make sure they aren't accidentally
4290 # applied to them.
4291
4292 { # Closure
4293
4294     sub add_map {
4295         # Add a range containing a mapping value to the list
4296
4297         my $self = shift;
4298         # Rest of parameters passed on
4299
4300         return $self->_add_delete('+', @_);
4301     }
4302
4303     sub add_duplicate {
4304         # Adds entry to a range list which can duplicate an existing entry
4305
4306         my $self = shift;
4307         my $code_point = shift;
4308         my $value = shift;
4309         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4310
4311         return $self->add_map($code_point, $code_point,
4312                                 $value, Replace => $MULTIPLE);
4313     }
4314 } # End of closure for package Range_Map
4315
4316 package _Base_Table;
4317
4318 # A table is the basic data structure that gets written out into a file for
4319 # use by the Perl core.  This is the abstract base class implementing the
4320 # common elements from the derived ones.  A list of the methods to be
4321 # furnished by an implementing class is just after the constructor.
4322
4323 sub standardize { return main::standardize($_[0]); }
4324 sub trace { return main::trace(@_); }
4325
4326 { # Closure
4327
4328     main::setup_package();
4329
4330     my %range_list;
4331     # Object containing the ranges of the table.
4332     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4333
4334     my %full_name;
4335     # The full table name.
4336     main::set_access('full_name', \%full_name, 'r');
4337
4338     my %name;
4339     # The table name, almost always shorter
4340     main::set_access('name', \%name, 'r');
4341
4342     my %short_name;
4343     # The shortest of all the aliases for this table, with underscores removed
4344     main::set_access('short_name', \%short_name);
4345
4346     my %nominal_short_name_length;
4347     # The length of short_name before removing underscores
4348     main::set_access('nominal_short_name_length',
4349                     \%nominal_short_name_length);
4350
4351     my %complete_name;
4352     # The complete name, including property.
4353     main::set_access('complete_name', \%complete_name, 'r');
4354
4355     my %property;
4356     # Parent property this table is attached to.
4357     main::set_access('property', \%property, 'r');
4358
4359     my %aliases;
4360     # Ordered list of aliases of the table's name.  The first ones in the list
4361     # are output first in comments
4362     main::set_access('aliases', \%aliases, 'readable_array');
4363
4364     my %comment;
4365     # A comment associated with the table for human readers of the files
4366     main::set_access('comment', \%comment, 's');
4367
4368     my %description;
4369     # A comment giving a short description of the table's meaning for human
4370     # readers of the files.
4371     main::set_access('description', \%description, 'readable_array');
4372
4373     my %note;
4374     # A comment giving a short note about the table for human readers of the
4375     # files.
4376     main::set_access('note', \%note, 'readable_array');
4377
4378     my %internal_only;
4379     # Boolean; if set means any file that contains this table is marked as for
4380     # internal-only use.
4381     main::set_access('internal_only', \%internal_only);
4382
4383     my %find_table_from_alias;
4384     # The parent property passes this pointer to a hash which this class adds
4385     # all its aliases to, so that the parent can quickly take an alias and
4386     # find this table.
4387     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4388
4389     my %locked;
4390     # After this table is made equivalent to another one; we shouldn't go
4391     # changing the contents because that could mean it's no longer equivalent
4392     main::set_access('locked', \%locked, 'r');
4393
4394     my %file_path;
4395     # This gives the final path to the file containing the table.  Each
4396     # directory in the path is an element in the array
4397     main::set_access('file_path', \%file_path, 'readable_array');
4398
4399     my %status;
4400     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4401     main::set_access('status', \%status, 'r');
4402
4403     my %status_info;
4404     # A comment about its being obsolete, or whatever non normal status it has
4405     main::set_access('status_info', \%status_info, 'r');
4406
4407     my %range_size_1;
4408     # Is the table to be output with each range only a single code point?
4409     # This is done to avoid breaking existing code that may have come to rely
4410     # on this behavior in previous versions of this program.)
4411     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4412
4413     my %perl_extension;
4414     # A boolean set iff this table is a Perl extension to the Unicode
4415     # standard.
4416     main::set_access('perl_extension', \%perl_extension, 'r');
4417
4418     my %output_range_counts;
4419     # A boolean set iff this table is to have comments written in the
4420     # output file that contain the number of code points in the range.
4421     # The constructor can override the global flag of the same name.
4422     main::set_access('output_range_counts', \%output_range_counts, 'r');
4423
4424     my %format;
4425     # The format of the entries of the table.  This is calculated from the
4426     # data in the table (or passed in the constructor).  This is an enum e.g.,
4427     # $STRING_FORMAT
4428     main::set_access('format', \%format, 'r', 'p_s');
4429
4430     sub new {
4431         # All arguments are key => value pairs, which you can see below, most
4432         # of which match fields documented above.  Otherwise: Pod_Entry,
4433         # Externally_Ok, and Fuzzy apply to the names of the table, and are
4434         # documented in the Alias package
4435
4436         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4437
4438         my $class = shift;
4439
4440         my $self = bless \do { my $anonymous_scalar }, $class;
4441         my $addr = do { no overloading; pack 'J', $self; };
4442
4443         my %args = @_;
4444
4445         $name{$addr} = delete $args{'Name'};
4446         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4447         $full_name{$addr} = delete $args{'Full_Name'};
4448         my $complete_name = $complete_name{$addr}
4449                           = delete $args{'Complete_Name'};
4450         $format{$addr} = delete $args{'Format'};
4451         $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4452         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4453         $property{$addr} = delete $args{'_Property'};
4454         $range_list{$addr} = delete $args{'_Range_List'};
4455         $status{$addr} = delete $args{'Status'} || $NORMAL;
4456         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4457         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4458
4459         my $description = delete $args{'Description'};
4460         my $externally_ok = delete $args{'Externally_Ok'};
4461         my $loose_match = delete $args{'Fuzzy'};
4462         my $note = delete $args{'Note'};
4463         my $make_pod_entry = delete $args{'Pod_Entry'};
4464         my $perl_extension = delete $args{'Perl_Extension'};
4465
4466         # Shouldn't have any left over
4467         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4468
4469         # Can't use || above because conceivably the name could be 0, and
4470         # can't use // operator in case this program gets used in Perl 5.8
4471         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4472         $output_range_counts{$addr} = $output_range_counts if
4473                                         ! defined $output_range_counts{$addr};
4474
4475         $aliases{$addr} = [ ];
4476         $comment{$addr} = [ ];
4477         $description{$addr} = [ ];
4478         $note{$addr} = [ ];
4479         $file_path{$addr} = [ ];
4480         $locked{$addr} = "";
4481
4482         push @{$description{$addr}}, $description if $description;
4483         push @{$note{$addr}}, $note if $note;
4484
4485         if ($status{$addr} eq $PLACEHOLDER) {
4486
4487             # A placeholder table doesn't get documented, is a perl extension,
4488             # and quite likely will be empty
4489             $make_pod_entry = 0 if ! defined $make_pod_entry;
4490             $perl_extension = 1 if ! defined $perl_extension;
4491             push @tables_that_may_be_empty, $complete_name{$addr};
4492         }
4493         elsif (! $status{$addr}) {
4494
4495             # If hasn't set its status already, see if it is on one of the
4496             # lists of properties or tables that have particular statuses; if
4497             # not, is normal.  The lists are prioritized so the most serious
4498             # ones are checked first
4499             if (exists $why_suppressed{$complete_name}
4500                 # Don't suppress if overridden
4501                 && ! grep { $_ eq $complete_name{$addr} }
4502                                                     @output_mapped_properties)
4503             {
4504                 $status{$addr} = $SUPPRESSED;
4505             }
4506             elsif (exists $why_deprecated{$complete_name}) {
4507                 $status{$addr} = $DEPRECATED;
4508             }
4509             elsif (exists $why_stabilized{$complete_name}) {
4510                 $status{$addr} = $STABILIZED;
4511             }
4512             elsif (exists $why_obsolete{$complete_name}) {
4513                 $status{$addr} = $OBSOLETE;
4514             }
4515
4516             # Existence above doesn't necessarily mean there is a message
4517             # associated with it.  Use the most serious message.
4518             if ($status{$addr}) {
4519                 if ($why_suppressed{$complete_name}) {
4520                     $status_info{$addr}
4521                                 = $why_suppressed{$complete_name};
4522                 }
4523                 elsif ($why_deprecated{$complete_name}) {
4524                     $status_info{$addr}
4525                                 = $why_deprecated{$complete_name};
4526                 }
4527                 elsif ($why_stabilized{$complete_name}) {
4528                     $status_info{$addr}
4529                                 = $why_stabilized{$complete_name};
4530                 }
4531                 elsif ($why_obsolete{$complete_name}) {
4532                     $status_info{$addr}
4533                                 = $why_obsolete{$complete_name};
4534                 }
4535             }
4536         }
4537
4538         $perl_extension{$addr} = $perl_extension || 0;
4539
4540         # By convention what typically gets printed only or first is what's
4541         # first in the list, so put the full name there for good output
4542         # clarity.  Other routines rely on the full name being first on the
4543         # list
4544         $self->add_alias($full_name{$addr},
4545                             Externally_Ok => $externally_ok,
4546                             Fuzzy => $loose_match,
4547                             Pod_Entry => $make_pod_entry,
4548                             Status => $status{$addr},
4549                             );
4550
4551         # Then comes the other name, if meaningfully different.
4552         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4553             $self->add_alias($name{$addr},
4554                             Externally_Ok => $externally_ok,
4555                             Fuzzy => $loose_match,
4556                             Pod_Entry => $make_pod_entry,
4557                             Status => $status{$addr},
4558                             );
4559         }
4560
4561         return $self;
4562     }
4563
4564     # Here are the methods that are required to be defined by any derived
4565     # class
4566     for my $sub (qw(
4567                     handle_special_range
4568                     append_to_body
4569                     pre_body
4570                 ))
4571                 # write() knows how to write out normal ranges, but it calls
4572                 # handle_special_range() when it encounters a non-normal one.
4573                 # append_to_body() is called by it after it has handled all
4574                 # ranges to add anything after the main portion of the table.
4575                 # And finally, pre_body() is called after all this to build up
4576                 # anything that should appear before the main portion of the
4577                 # table.  Doing it this way allows things in the middle to
4578                 # affect what should appear before the main portion of the
4579                 # table.
4580     {
4581         no strict "refs";
4582         *$sub = sub {
4583             Carp::my_carp_bug( __LINE__
4584                               . ": Must create method '$sub()' for "
4585                               . ref shift);
4586             return;
4587         }
4588     }
4589
4590     use overload
4591         fallback => 0,
4592         "." => \&main::_operator_dot,
4593         '!=' => \&main::_operator_not_equal,
4594         '==' => \&main::_operator_equal,
4595     ;
4596
4597     sub ranges {
4598         # Returns the array of ranges associated with this table.
4599
4600         no overloading;
4601         return $range_list{pack 'J', shift}->ranges;
4602     }
4603
4604     sub add_alias {
4605         # Add a synonym for this table.
4606
4607         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4608
4609         my $self = shift;
4610         my $name = shift;       # The name to add.
4611         my $pointer = shift;    # What the alias hash should point to.  For
4612                                 # map tables, this is the parent property;
4613                                 # for match tables, it is the table itself.
4614
4615         my %args = @_;
4616         my $loose_match = delete $args{'Fuzzy'};
4617
4618         my $make_pod_entry = delete $args{'Pod_Entry'};
4619         $make_pod_entry = $YES unless defined $make_pod_entry;
4620
4621         my $externally_ok = delete $args{'Externally_Ok'};
4622         $externally_ok = 1 unless defined $externally_ok;
4623
4624         my $status = delete $args{'Status'};
4625         $status = $NORMAL unless defined $status;
4626
4627         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4628
4629         # Capitalize the first letter of the alias unless it is one of the CJK
4630         # ones which specifically begins with a lower 'k'.  Do this because
4631         # Unicode has varied whether they capitalize first letters or not, and
4632         # have later changed their minds and capitalized them, but not the
4633         # other way around.  So do it always and avoid changes from release to
4634         # release
4635         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4636
4637         my $addr = do { no overloading; pack 'J', $self; };
4638
4639         # Figure out if should be loosely matched if not already specified.
4640         if (! defined $loose_match) {
4641
4642             # Is a loose_match if isn't null, and doesn't begin with an
4643             # underscore and isn't just a number
4644             if ($name ne ""
4645                 && substr($name, 0, 1) ne '_'
4646                 && $name !~ qr{^[0-9_.+-/]+$})
4647             {
4648                 $loose_match = 1;
4649             }
4650             else {
4651                 $loose_match = 0;
4652             }
4653         }
4654
4655         # If this alias has already been defined, do nothing.
4656         return if defined $find_table_from_alias{$addr}->{$name};
4657
4658         # That includes if it is standardly equivalent to an existing alias,
4659         # in which case, add this name to the list, so won't have to search
4660         # for it again.
4661         my $standard_name = main::standardize($name);
4662         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4663             $find_table_from_alias{$addr}->{$name}
4664                         = $find_table_from_alias{$addr}->{$standard_name};
4665             return;
4666         }
4667
4668         # Set the index hash for this alias for future quick reference.
4669         $find_table_from_alias{$addr}->{$name} = $pointer;
4670         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4671         local $to_trace = 0 if main::DEBUG;
4672         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4673         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4674
4675
4676         # Put the new alias at the end of the list of aliases unless the final
4677         # element begins with an underscore (meaning it is for internal perl
4678         # use) or is all numeric, in which case, put the new one before that
4679         # one.  This floats any all-numeric or underscore-beginning aliases to
4680         # the end.  This is done so that they are listed last in output lists,
4681         # to encourage the user to use a better name (either more descriptive
4682         # or not an internal-only one) instead.  This ordering is relied on
4683         # implicitly elsewhere in this program, like in short_name()
4684         my $list = $aliases{$addr};
4685         my $insert_position = (@$list == 0
4686                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4687                                     && $list->[-1]->name =~ /\D/))
4688                             ? @$list
4689                             : @$list - 1;
4690         splice @$list,
4691                 $insert_position,
4692                 0,
4693                 Alias->new($name, $loose_match, $make_pod_entry,
4694                                                     $externally_ok, $status);
4695
4696         # This name may be shorter than any existing ones, so clear the cache
4697         # of the shortest, so will have to be recalculated.
4698         no overloading;
4699         undef $short_name{pack 'J', $self};
4700         return;
4701     }
4702
4703     sub short_name {
4704         # Returns a name suitable for use as the base part of a file name.
4705         # That is, shorter wins.  It can return undef if there is no suitable
4706         # name.  The name has all non-essential underscores removed.
4707
4708         # The optional second parameter is a reference to a scalar in which
4709         # this routine will store the length the returned name had before the
4710         # underscores were removed, or undef if the return is undef.
4711
4712         # The shortest name can change if new aliases are added.  So using
4713         # this should be deferred until after all these are added.  The code
4714         # that does that should clear this one's cache.
4715         # Any name with alphabetics is preferred over an all numeric one, even
4716         # if longer.
4717
4718         my $self = shift;
4719         my $nominal_length_ptr = shift;
4720         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4721
4722         my $addr = do { no overloading; pack 'J', $self; };
4723
4724         # For efficiency, don't recalculate, but this means that adding new
4725         # aliases could change what the shortest is, so the code that does
4726         # that needs to undef this.
4727         if (defined $short_name{$addr}) {
4728             if ($nominal_length_ptr) {
4729                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4730             }
4731             return $short_name{$addr};
4732         }
4733
4734         # Look at each alias
4735         foreach my $alias ($self->aliases()) {
4736
4737             # Don't use an alias that isn't ok to use for an external name.
4738             next if ! $alias->externally_ok;
4739
4740             my $name = main::Standardize($alias->name);
4741             trace $self, $name if main::DEBUG && $to_trace;
4742
4743             # Take the first one, or a shorter one that isn't numeric.  This
4744             # relies on numeric aliases always being last in the array
4745             # returned by aliases().  Any alpha one will have precedence.
4746             if (! defined $short_name{$addr}
4747                 || ($name =~ /\D/
4748                     && length($name) < length($short_name{$addr})))
4749             {
4750                 # Remove interior underscores.
4751                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4752
4753                 $nominal_short_name_length{$addr} = length $name;
4754             }
4755         }
4756
4757         # If no suitable external name return undef
4758         if (! defined $short_name{$addr}) {
4759             $$nominal_length_ptr = undef if $nominal_length_ptr;
4760             return;
4761         }
4762
4763         # Don't allow a null external name.
4764         if ($short_name{$addr} eq "") {
4765             $short_name{$addr} = '_';
4766             $nominal_short_name_length{$addr} = 1;
4767         }
4768
4769         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4770
4771         if ($nominal_length_ptr) {
4772             $$nominal_length_ptr = $nominal_short_name_length{$addr};
4773         }
4774         return $short_name{$addr};
4775     }
4776
4777     sub external_name {
4778         # Returns the external name that this table should be known by.  This
4779         # is usually the short_name, but not if the short_name is undefined.
4780
4781         my $self = shift;
4782         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4783
4784         my $short = $self->short_name;
4785         return $short if defined $short;
4786
4787         return '_';
4788     }
4789
4790     sub add_description { # Adds the parameter as a short description.
4791
4792         my $self = shift;
4793         my $description = shift;
4794         chomp $description;
4795         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4796
4797         no overloading;
4798         push @{$description{pack 'J', $self}}, $description;
4799
4800         return;
4801     }
4802
4803     sub add_note { # Adds the parameter as a short note.
4804
4805         my $self = shift;
4806         my $note = shift;
4807         chomp $note;
4808         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4809
4810         no overloading;
4811         push @{$note{pack 'J', $self}}, $note;
4812
4813         return;
4814     }
4815
4816     sub add_comment { # Adds the parameter as a comment.
4817
4818         return unless $debugging_build;
4819
4820         my $self = shift;
4821         my $comment = shift;
4822         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4823
4824         chomp $comment;
4825
4826         no overloading;
4827         push @{$comment{pack 'J', $self}}, $comment;
4828
4829         return;
4830     }
4831
4832     sub comment {
4833         # Return the current comment for this table.  If called in list
4834         # context, returns the array of comments.  In scalar, returns a string
4835         # of each element joined together with a period ending each.
4836
4837         my $self = shift;
4838         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4839
4840         my $addr = do { no overloading; pack 'J', $self; };
4841         my @list = @{$comment{$addr}};
4842         return @list if wantarray;
4843         my $return = "";
4844         foreach my $sentence (@list) {
4845             $return .= '.  ' if $return;
4846             $return .= $sentence;
4847             $return =~ s/\.$//;
4848         }
4849         $return .= '.' if $return;
4850         return $return;
4851     }
4852
4853     sub initialize {
4854         # Initialize the table with the argument which is any valid
4855         # initialization for range lists.
4856
4857         my $self = shift;
4858         my $addr = do { no overloading; pack 'J', $self; };
4859         my $initialization = shift;
4860         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4861
4862         # Replace the current range list with a new one of the same exact
4863         # type.
4864         my $class = ref $range_list{$addr};
4865         $range_list{$addr} = $class->new(Owner => $self,
4866                                         Initialize => $initialization);
4867         return;
4868
4869     }
4870
4871     sub header {
4872         # The header that is output for the table in the file it is written
4873         # in.
4874
4875         my $self = shift;
4876         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4877
4878         my $return = "";
4879         $return .= $DEVELOPMENT_ONLY if $compare_versions;
4880         $return .= $HEADER;
4881         no overloading;
4882         $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
4883         return $return;
4884     }
4885
4886     sub write {
4887         # Write a representation of the table to its file.  It calls several
4888         # functions furnished by sub-classes of this abstract base class to
4889         # handle non-normal ranges, to add stuff before the table, and at its
4890         # end.
4891
4892         my $self = shift;
4893         my $tab_stops = shift;       # The number of tab stops over to put any
4894                                      # comment.
4895         my $suppress_value = shift;  # Optional, if the value associated with
4896                                      # a range equals this one, don't write
4897                                      # the range
4898         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4899
4900         my $addr = do { no overloading; pack 'J', $self; };
4901
4902         # Start with the header
4903         my @HEADER = $self->header;
4904
4905         # Then the comments
4906         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4907                                                         if $comment{$addr};
4908
4909         # Things discovered processing the main body of the document may
4910         # affect what gets output before it, therefore pre_body() isn't called
4911         # until after all other processing of the table is done.
4912
4913         # The main body looks like a 'here' document.  If annotating, get rid
4914         # of the comments before passing to the caller, as some callers, such
4915         # as charnames.pm, can't cope with them.  (Outputting range counts
4916         # also introduces comments, but these don't show up in the tables that
4917         # can't cope with comments, and there aren't that many of them that
4918         # it's worth the extra real time to get rid of them).
4919         my @OUT;
4920         if ($annotate) {
4921             # Use the line below in Perls that don't have /r
4922             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
4923             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
4924         } else {
4925             push @OUT, "return <<'END';\n";
4926         }
4927
4928         if ($range_list{$addr}->is_empty) {
4929
4930             # This is a kludge for empty tables to silence a warning in
4931             # utf8.c, which can't really deal with empty tables, but it can
4932             # deal with a table that matches nothing, as the inverse of 'Any'
4933             # does.
4934             push @OUT, "!utf8::IsAny\n";
4935         }
4936         else {
4937             my $range_size_1 = $range_size_1{$addr};
4938             my $format;            # Used only in $annotate option
4939             my $include_name;      # Used only in $annotate option
4940
4941             if ($annotate) {
4942
4943                 # if annotating each code point, must print 1 per line.
4944                 # The variable could point to a subroutine, and we don't want
4945                 # to lose that fact, so only set if not set already
4946                 $range_size_1 = 1 if ! $range_size_1;
4947
4948                 $format = $self->format;
4949
4950                 # The name of the character is output only for tables that
4951                 # don't already include the name in the output.
4952                 my $property = $self->property;
4953                 $include_name =
4954                     !  ($property == $perl_charname
4955                         || $property == main::property_ref('Unicode_1_Name')
4956                         || $property == main::property_ref('Name')
4957                         || $property == main::property_ref('Name_Alias')
4958                        );
4959             }
4960
4961             # Output each range as part of the here document.
4962             RANGE:
4963             for my $set ($range_list{$addr}->ranges) {
4964                 if ($set->type != 0) {
4965                     $self->handle_special_range($set);
4966                     next RANGE;
4967                 }
4968                 my $start = $set->start;
4969                 my $end   = $set->end;
4970                 my $value  = $set->value;
4971
4972                 # Don't output ranges whose value is the one to suppress
4973                 next RANGE if defined $suppress_value
4974                               && $value eq $suppress_value;
4975
4976                 # If there is a range and doesn't need a single point range
4977                 # output
4978                 if ($start != $end && ! $range_size_1) {
4979                     push @OUT, sprintf "%04X\t%04X", $start, $end;
4980                     $OUT[-1] .= "\t$value" if $value ne "";
4981
4982                     # Add a comment with the size of the range, if requested.
4983                     # Expand Tabs to make sure they all start in the same
4984                     # column, and then unexpand to use mostly tabs.
4985                     if (! $output_range_counts{$addr}) {
4986                         $OUT[-1] .= "\n";
4987                     }
4988                     else {
4989                         $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4990                         my $count = main::clarify_number($end - $start + 1);
4991                         use integer;
4992
4993                         my $width = $tab_stops * 8 - 1;
4994                         $OUT[-1] = sprintf("%-*s # [%s]\n",
4995                                             $width,
4996                                             $OUT[-1],
4997                                             $count);
4998                         $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4999                     }
5000                     next RANGE;
5001                 }
5002
5003                 # Here to output a single code point per line
5004
5005                 # If not to annotate, use the simple formats
5006                 if (! $annotate) {
5007
5008                     # Use any passed in subroutine to output.
5009                     if (ref $range_size_1 eq 'CODE') {
5010                         for my $i ($start .. $end) {
5011                             push @OUT, &{$range_size_1}($i, $value);
5012                         }
5013                     }
5014                     else {
5015
5016                         # Here, caller is ok with default output.
5017                         for (my $i = $start; $i <= $end; $i++) {
5018                             push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5019                         }
5020                     }
5021                     next RANGE;
5022                 }
5023
5024                 # Here, wants annotation.
5025                 for (my $i = $start; $i <= $end; $i++) {
5026
5027                     # Get character information if don't have it already
5028                     main::populate_char_info($i)
5029                                         if ! defined $viacode[$i];
5030                     my $type = $annotate_char_type[$i];
5031
5032                     # Figure out if should output the next code points as part
5033                     # of a range or not.  If this is not in an annotation
5034                     # range, then won't output as a range, so returns $i.
5035                     # Otherwise use the end of the annotation range, but no
5036                     # further than the maximum possible end point of the loop.
5037                     my $range_end = main::min($annotate_ranges->value_of($i)
5038                                                                         || $i,
5039                                                $end);
5040
5041                     # Use a range if it is a range, and either is one of the
5042                     # special annotation ranges, or the range is at most 3
5043                     # long.  This last case causes the algorithmically named
5044                     # code points to be output individually in spans of at
5045                     # most 3, as they are the ones whose $type is > 0.
5046                     if ($range_end != $i
5047                         && ( $type < 0 || $range_end - $i > 2))
5048                     {
5049                         # Here is to output a range.  We don't allow a
5050                         # caller-specified output format--just use the
5051                         # standard one.
5052                         push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5053                                                                 $range_end,
5054                                                                 $value;
5055                         my $range_name = $viacode[$i];
5056
5057                         # For the code points which end in their hex value, we
5058                         # eliminate that from the output annotation, and
5059                         # capitalize only the first letter of each word.
5060                         if ($type == $CP_IN_NAME) {
5061                             my $hex = sprintf "%04X", $i;
5062                             $range_name =~ s/-$hex$//;
5063                             my @words = split " ", $range_name;
5064                             for my $word (@words) {
5065                                 $word = ucfirst(lc($word)) if $word ne 'CJK';
5066                             }
5067                             $range_name = join " ", @words;
5068                         }
5069                         elsif ($type == $HANGUL_SYLLABLE) {
5070                             $range_name = "Hangul Syllable";
5071                         }
5072
5073                         $OUT[-1] .= " $range_name" if $range_name;
5074
5075                         # Include the number of code points in the range
5076                         my $count = main::clarify_number($range_end - $i + 1);
5077                         $OUT[-1] .= " [$count]\n";
5078
5079                         # Skip to the end of the range
5080                         $i = $range_end;
5081                     }
5082                     else { # Not in a range.
5083                         my $comment = "";
5084
5085                         # When outputting the names of each character, use
5086                         # the character itself if printable
5087                         $comment .= "'" . chr($i) . "' " if $printable[$i];
5088
5089                         # To make it more readable, use a minimum indentation
5090                         my $comment_indent;
5091
5092                         # Determine the annotation
5093                         if ($format eq $DECOMP_STRING_FORMAT) {
5094
5095                             # This is very specialized, with the type of
5096                             # decomposition beginning the line enclosed in
5097                             # <...>, and the code points that the code point
5098                             # decomposes to separated by blanks.  Create two
5099                             # strings, one of the printable characters, and
5100                             # one of their official names.
5101                             (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5102                             my $tostr = "";
5103                             my $to_name = "";
5104                             my $to_chr = "";
5105                             foreach my $to (split " ", $map) {
5106                                 $to = CORE::hex $to;
5107                                 $to_name .= " + " if $to_name;
5108                                 $to_chr .= chr($to);
5109                                 main::populate_char_info($to)
5110                                                     if ! defined $viacode[$to];
5111                                 $to_name .=  $viacode[$to];
5112                             }
5113
5114                             $comment .=
5115                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5116                             $comment_indent = 25;   # Determined by experiment
5117                         }
5118                         else {
5119
5120                             # Assume that any table that has hex format is a
5121                             # mapping of one code point to another.
5122                             if ($format eq $HEX_FORMAT) {
5123                                 my $decimal_value = CORE::hex $value;
5124                                 main::populate_char_info($decimal_value)
5125                                         if ! defined $viacode[$decimal_value];
5126                                 $comment .= "=> '"
5127                                          . chr($decimal_value)
5128                                          . "'; " if $printable[$decimal_value];
5129                             }
5130                             $comment .= $viacode[$i] if $include_name
5131                                                         && $viacode[$i];
5132                             if ($format eq $HEX_FORMAT) {
5133                                 my $decimal_value = CORE::hex $value;
5134                                 $comment .= " => $viacode[$decimal_value]"
5135                                                     if $viacode[$decimal_value];
5136                             }
5137
5138                             # If including the name, no need to indent, as the
5139                             # name will already be way across the line.
5140                             $comment_indent = ($include_name) ? 0 : 60;
5141                         }
5142
5143                         # Use any passed in routine to output the base part of
5144                         # the line.
5145                         if (ref $range_size_1 eq 'CODE') {
5146                             my $base_part = &{$range_size_1}($i, $value);
5147                             chomp $base_part;
5148                             push @OUT, $base_part;
5149                         }
5150                         else {
5151                             push @OUT, sprintf "%04X\t\t%s", $i, $value;
5152                         }
5153
5154                         # And add the annotation.
5155                         $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5156                                                          $OUT[-1],
5157                                                          $comment if $comment;
5158                         $OUT[-1] .= "\n";
5159                     }
5160                 }
5161             } # End of loop through all the table's ranges
5162         }
5163
5164         # Add anything that goes after the main body, but within the here
5165         # document,
5166         my $append_to_body = $self->append_to_body;
5167         push @OUT, $append_to_body if $append_to_body;
5168
5169         # And finish the here document.
5170         push @OUT, "END\n";
5171
5172         # Done with the main portion of the body.  Can now figure out what
5173         # should appear before it in the file.
5174         my $pre_body = $self->pre_body;
5175         push @HEADER, $pre_body, "\n" if $pre_body;
5176
5177         # All these files have a .pl suffix
5178         $file_path{$addr}->[-1] .= '.pl';
5179
5180         main::write($file_path{$addr},
5181                     $annotate,      # utf8 iff annotating
5182                     \@HEADER,
5183                     \@OUT);
5184         return;
5185     }
5186
5187     sub set_status {    # Set the table's status
5188         my $self = shift;
5189         my $status = shift; # The status enum value
5190         my $info = shift;   # Any message associated with it.
5191         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5192
5193         my $addr = do { no overloading; pack 'J', $self; };
5194
5195         $status{$addr} = $status;
5196         $status_info{$addr} = $info;
5197         return;
5198     }
5199
5200     sub lock {
5201         # Don't allow changes to the table from now on.  This stores a stack
5202         # trace of where it was called, so that later attempts to modify it
5203         # can immediately show where it got locked.
5204
5205         my $self = shift;
5206         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5207
5208         my $addr = do { no overloading; pack 'J', $self; };
5209
5210         $locked{$addr} = "";
5211
5212         my $line = (caller(0))[2];
5213         my $i = 1;
5214
5215         # Accumulate the stack trace
5216         while (1) {
5217             my ($pkg, $file, $caller_line, $caller) = caller $i++;
5218
5219             last unless defined $caller;
5220
5221             $locked{$addr} .= "    called from $caller() at line $line\n";
5222             $line = $caller_line;
5223         }
5224         $locked{$addr} .= "    called from main at line $line\n";
5225
5226         return;
5227     }
5228
5229     sub carp_if_locked {
5230         # Return whether a table is locked or not, and, by the way, complain
5231         # if is locked
5232
5233         my $self = shift;
5234         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5235
5236         my $addr = do { no overloading; pack 'J', $self; };
5237
5238         return 0 if ! $locked{$addr};
5239         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5240         return 1;
5241     }
5242
5243     sub set_file_path { # Set the final directory path for this table
5244         my $self = shift;
5245         # Rest of parameters passed on
5246
5247         no overloading;
5248         @{$file_path{pack 'J', $self}} = @_;
5249         return
5250     }
5251
5252     # Accessors for the range list stored in this table.  First for
5253     # unconditional
5254     for my $sub (qw(
5255                     containing_range
5256                     contains
5257                     count
5258                     each_range
5259                     hash
5260                     is_empty
5261                     matches_identically_to
5262                     max
5263                     min
5264                     range_count
5265                     reset_each_range
5266                     type_of
5267                     value_of
5268                 ))
5269     {
5270         no strict "refs";
5271         *$sub = sub {
5272             use strict "refs";
5273             my $self = shift;
5274             no overloading;
5275             return $range_list{pack 'J', $self}->$sub(@_);
5276         }
5277     }
5278
5279     # Then for ones that should fail if locked
5280     for my $sub (qw(
5281                     delete_range
5282                 ))
5283     {
5284         no strict "refs";
5285         *$sub = sub {
5286             use strict "refs";
5287             my $self = shift;
5288
5289             return if $self->carp_if_locked;
5290             no overloading;
5291             return $range_list{pack 'J', $self}->$sub(@_);
5292         }
5293     }
5294
5295 } # End closure
5296
5297 package Map_Table;
5298 use base '_Base_Table';
5299
5300 # A Map Table is a table that contains the mappings from code points to
5301 # values.  There are two weird cases:
5302 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5303 #    are written in the table's file at the end of the table nonetheless.  It
5304 #    requires specially constructed code to handle these; utf8.c can not read
5305 #    these in, so they should not go in $map_directory.  As of this writing,
5306 #    the only case that these happen is for named sequences used in
5307 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
5308 #    something else could come along that uses it.
5309 # 2) Specials are anything that doesn't fit syntactically into the body of the
5310 #    table.  The ranges for these have a map type of non-zero.  The code below
5311 #    knows about and handles each possible type.   In most cases, these are
5312 #    written as part of the header.
5313 #
5314 # A map table deliberately can't be manipulated at will unlike match tables.
5315 # This is because of the ambiguities having to do with what to do with
5316 # overlapping code points.  And there just isn't a need for those things;
5317 # what one wants to do is just query, add, replace, or delete mappings, plus
5318 # write the final result.
5319 # However, there is a method to get the list of possible ranges that aren't in
5320 # this table to use for defaulting missing code point mappings.  And,
5321 # map_add_or_replace_non_nulls() does allow one to add another table to this
5322 # one, but it is clearly very specialized, and defined that the other's
5323 # non-null values replace this one's if there is any overlap.
5324
5325 sub trace { return main::trace(@_); }
5326
5327 { # Closure
5328
5329     main::setup_package();
5330
5331     my %default_map;
5332     # Many input files omit some entries; this gives what the mapping for the
5333     # missing entries should be
5334     main::set_access('default_map', \%default_map, 'r');
5335
5336     my %anomalous_entries;
5337     # Things that go in the body of the table which don't fit the normal
5338     # scheme of things, like having a range.  Not much can be done with these
5339     # once there except to output them.  This was created to handle named
5340     # sequences.
5341     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5342     main::set_access('anomalous_entries',       # Append singular, read plural
5343                     \%anomalous_entries,
5344                     'readable_array');
5345
5346     my %core_access;
5347     # This is a string, solely for documentation, indicating how one can get
5348     # access to this property via the Perl core.
5349     main::set_access('core_access', \%core_access, 'r', 's');
5350
5351     my %to_output_map;
5352     # Boolean as to whether or not to write out this map table
5353     main::set_access('to_output_map', \%to_output_map, 's');
5354
5355
5356     sub new {
5357         my $class = shift;
5358         my $name = shift;
5359
5360         my %args = @_;
5361
5362         # Optional initialization data for the table.
5363         my $initialize = delete $args{'Initialize'};
5364
5365         my $core_access = delete $args{'Core_Access'};
5366         my $default_map = delete $args{'Default_Map'};
5367         my $property = delete $args{'_Property'};
5368         my $full_name = delete $args{'Full_Name'};
5369         # Rest of parameters passed on
5370
5371         my $range_list = Range_Map->new(Owner => $property);
5372
5373         my $self = $class->SUPER::new(
5374                                     Name => $name,
5375                                     Complete_Name =>  $full_name,
5376                                     Full_Name => $full_name,
5377                                     _Property => $property,
5378                                     _Range_List => $range_list,
5379                                     %args);
5380
5381         my $addr = do { no overloading; pack 'J', $self; };
5382
5383         $anomalous_entries{$addr} = [];
5384         $core_access{$addr} = $core_access;
5385         $default_map{$addr} = $default_map;
5386
5387         $self->initialize($initialize) if defined $initialize;
5388
5389         return $self;
5390     }
5391
5392     use overload
5393         fallback => 0,
5394         qw("") => "_operator_stringify",
5395     ;
5396
5397     sub _operator_stringify {
5398         my $self = shift;
5399
5400         my $name = $self->property->full_name;
5401         $name = '""' if $name eq "";
5402         return "Map table for Property '$name'";
5403     }
5404
5405     sub add_alias {
5406         # Add a synonym for this table (which means the property itself)
5407         my $self = shift;
5408         my $name = shift;
5409         # Rest of parameters passed on.
5410
5411         $self->SUPER::add_alias($name, $self->property, @_);
5412         return;
5413     }
5414
5415     sub add_map {
5416         # Add a range of code points to the list of specially-handled code
5417         # points.  $MULTI_CP is assumed if the type of special is not passed
5418         # in.
5419
5420         my $self = shift;
5421         my $lower = shift;
5422         my $upper = shift;
5423         my $string = shift;
5424         my %args = @_;
5425
5426         my $type = delete $args{'Type'} || 0;
5427         # Rest of parameters passed on
5428
5429         # Can't change the table if locked.
5430         return if $self->carp_if_locked;
5431
5432         my $addr = do { no overloading; pack 'J', $self; };
5433
5434         $self->_range_list->add_map($lower, $upper,
5435                                     $string,
5436                                     @_,
5437                                     Type => $type);
5438         return;
5439     }
5440
5441     sub append_to_body {
5442         # Adds to the written HERE document of the table's body any anomalous
5443         # entries in the table..
5444
5445         my $self = shift;
5446         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5447
5448         my $addr = do { no overloading; pack 'J', $self; };
5449
5450         return "" unless @{$anomalous_entries{$addr}};
5451         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5452     }
5453
5454     sub map_add_or_replace_non_nulls {
5455         # This adds the mappings in the table $other to $self.  Non-null
5456         # mappings from $other override those in $self.  It essentially merges
5457         # the two tables, with the second having priority except for null
5458         # mappings.
5459
5460         my $self = shift;
5461         my $other = shift;
5462         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5463
5464         return if $self->carp_if_locked;
5465
5466         if (! $other->isa(__PACKAGE__)) {
5467             Carp::my_carp_bug("$other should be a "
5468                         . __PACKAGE__
5469                         . ".  Not a '"
5470                         . ref($other)
5471                         . "'.  Not added;");
5472             return;
5473         }
5474
5475         my $addr = do { no overloading; pack 'J', $self; };
5476         my $other_addr = do { no overloading; pack 'J', $other; };
5477
5478         local $to_trace = 0 if main::DEBUG;
5479
5480         my $self_range_list = $self->_range_list;
5481         my $other_range_list = $other->_range_list;
5482         foreach my $range ($other_range_list->ranges) {
5483             my $value = $range->value;
5484             next if $value eq "";
5485             $self_range_list->_add_delete('+',
5486                                           $range->start,
5487                                           $range->end,
5488                                           $value,
5489                                           Type => $range->type,
5490                                           Replace => $UNCONDITIONALLY);
5491         }
5492
5493         return;
5494     }
5495
5496     sub set_default_map {
5497         # Define what code points that are missing from the input files should
5498         # map to
5499
5500         my $self = shift;
5501         my $map = shift;
5502         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5503
5504         my $addr = do { no overloading; pack 'J', $self; };
5505
5506         # Convert the input to the standard equivalent, if any (won't have any
5507         # for $STRING properties)
5508         my $standard = $self->_find_table_from_alias->{$map};
5509         $map = $standard->name if defined $standard;
5510
5511         # Warn if there already is a non-equivalent default map for this
5512         # property.  Note that a default map can be a ref, which means that
5513         # what it actually means is delayed until later in the program, and it
5514         # IS permissible to override it here without a message.
5515         my $default_map = $default_map{$addr};
5516         if (defined $default_map
5517             && ! ref($default_map)
5518             && $default_map ne $map
5519             && main::Standardize($map) ne $default_map)
5520         {
5521             my $property = $self->property;
5522             my $map_table = $property->table($map);
5523             my $default_table = $property->table($default_map);
5524             if (defined $map_table
5525                 && defined $default_table
5526                 && $map_table != $default_table)
5527             {
5528                 Carp::my_carp("Changing the default mapping for "
5529                             . $property
5530                             . " from $default_map to $map'");
5531             }
5532         }
5533
5534         $default_map{$addr} = $map;
5535
5536         # Don't also create any missing table for this map at this point,
5537         # because if we did, it could get done before the main table add is
5538         # done for PropValueAliases.txt; instead the caller will have to make
5539         # sure it exists, if desired.
5540         return;
5541     }
5542
5543     sub to_output_map {
5544         # Returns boolean: should we write this map table?
5545
5546         my $self = shift;
5547         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5548
5549         my $addr = do { no overloading; pack 'J', $self; };
5550
5551         # If overridden, use that
5552         return $to_output_map{$addr} if defined $to_output_map{$addr};
5553
5554         my $full_name = $self->full_name;
5555
5556         # If table says to output, do so; if says to suppress it, do do.
5557         return 1 if grep { $_ eq $full_name } @output_mapped_properties;
5558         return 0 if $self->status eq $SUPPRESSED;
5559
5560         my $type = $self->property->type;
5561
5562         # Don't want to output binary map tables even for debugging.
5563         return 0 if $type == $BINARY;
5564
5565         # But do want to output string ones.
5566         return 1 if $type == $STRING;
5567
5568         # Otherwise is an $ENUM, don't output it
5569         return 0;
5570     }
5571
5572     sub inverse_list {
5573         # Returns a Range_List that is gaps of the current table.  That is,
5574         # the inversion
5575
5576         my $self = shift;
5577         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5578
5579         my $current = Range_List->new(Initialize => $self->_range_list,
5580                                 Owner => $self->property);
5581         return ~ $current;
5582     }
5583
5584     sub set_final_comment {
5585         # Just before output, create the comment that heads the file
5586         # containing this table.
5587
5588         return unless $debugging_build;
5589
5590         my $self = shift;
5591         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5592
5593         # No sense generating a comment if aren't going to write it out.
5594         return if ! $self->to_output_map;
5595
5596         my $addr = do { no overloading; pack 'J', $self; };
5597
5598         my $property = $self->property;
5599
5600         # Get all the possible names for this property.  Don't use any that
5601         # aren't ok for use in a file name, etc.  This is perhaps causing that
5602         # flag to do double duty, and may have to be changed in the future to
5603         # have our own flag for just this purpose; but it works now to exclude
5604         # Perl generated synonyms from the lists for properties, where the
5605         # name is always the proper Unicode one.
5606         my @property_aliases = grep { $_->externally_ok } $self->aliases;
5607
5608         my $count = $self->count;
5609         my $default_map = $default_map{$addr};
5610
5611         # The ranges that map to the default aren't output, so subtract that
5612         # to get those actually output.  A property with matching tables
5613         # already has the information calculated.
5614         if ($property->type != $STRING) {
5615             $count -= $property->table($default_map)->count;
5616         }
5617         elsif (defined $default_map) {
5618
5619             # But for $STRING properties, must calculate now.  Subtract the
5620             # count from each range that maps to the default.
5621             foreach my $range ($self->_range_list->ranges) {
5622                 if ($range->value eq $default_map) {
5623                     $count -= $range->end +1 - $range->start;
5624                 }
5625             }
5626
5627         }
5628
5629         # Get a  string version of $count with underscores in large numbers,
5630         # for clarity.
5631         my $string_count = main::clarify_number($count);
5632
5633         my $code_points = ($count == 1)
5634                         ? 'single code point'
5635                         : "$string_count code points";
5636
5637         my $mapping;
5638         my $these_mappings;
5639         my $are;
5640         if (@property_aliases <= 1) {
5641             $mapping = 'mapping';
5642             $these_mappings = 'this mapping';
5643             $are = 'is'
5644         }
5645         else {
5646             $mapping = 'synonymous mappings';
5647             $these_mappings = 'these mappings';
5648             $are = 'are'
5649         }
5650         my $cp;
5651         if ($count >= $MAX_UNICODE_CODEPOINTS) {
5652             $cp = "any code point in Unicode Version $string_version";
5653         }
5654         else {
5655             my $map_to;
5656             if ($default_map eq "") {
5657                 $map_to = 'the null string';
5658             }
5659             elsif ($default_map eq $CODE_POINT) {
5660                 $map_to = "itself";
5661             }
5662             else {
5663                 $map_to = "'$default_map'";
5664             }
5665             if ($count == 1) {
5666                 $cp = "the single code point";
5667             }
5668             else {
5669                 $cp = "one of the $code_points";
5670             }
5671             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5672         }
5673
5674         my $comment = "";
5675
5676         my $status = $self->status;
5677         if ($status) {
5678             my $warn = uc $status_past_participles{$status};
5679             $comment .= <<END;
5680
5681 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
5682  All property or property=value combinations contained in this file are $warn.
5683  See $unicode_reference_url for what this means.
5684
5685 END
5686         }
5687         $comment .= "This file returns the $mapping:\n";
5688
5689         for my $i (0 .. @property_aliases - 1) {
5690             $comment .= sprintf("%-8s%s\n",
5691                                 " ",
5692                                 $property_aliases[$i]->name . '(cp)'
5693                                 );
5694         }
5695         $comment .=
5696                 "\nwhere 'cp' is $cp.  Note that $these_mappings $are ";
5697
5698         my $access = $core_access{$addr};
5699         if ($access) {
5700             $comment .= "accessible through the Perl core via $access.";
5701         }
5702         else {
5703             $comment .= "not accessible through the Perl core directly.";
5704         }
5705
5706         # And append any commentary already set from the actual property.
5707         $comment .= "\n\n" . $self->comment if $self->comment;
5708         if ($self->description) {
5709             $comment .= "\n\n" . join " ", $self->description;
5710         }
5711         if ($self->note) {
5712             $comment .= "\n\n" . join " ", $self->note;
5713         }
5714         $comment .= "\n";
5715
5716         if (! $self->perl_extension) {
5717             $comment .= <<END;
5718
5719 For information about what this property really means, see:
5720 $unicode_reference_url
5721 END
5722         }
5723
5724         if ($count) {        # Format differs for empty table
5725                 $comment.= "\nThe format of the ";
5726             if ($self->range_size_1) {
5727                 $comment.= <<END;
5728 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5729 is in hex; MAPPING is what CODE_POINT maps to.
5730 END
5731             }
5732             else {
5733
5734                 # There are tables which end up only having one element per
5735                 # range, but it is not worth keeping track of for making just
5736                 # this comment a little better.
5737                 $comment.= <<END;
5738 non-comment portions of the main body of lines of this file is:
5739 START\\tSTOP\\tMAPPING where START is the starting code point of the
5740 range, in hex; STOP is the ending point, or if omitted, the range has just one
5741 code point; MAPPING is what each code point between START and STOP maps to.
5742 END
5743                 if ($self->output_range_counts) {
5744                     $comment .= <<END;
5745 Numbers in comments in [brackets] indicate how many code points are in the
5746 range (omitted when the range is a single code point or if the mapping is to
5747 the null string).
5748 END
5749                 }
5750             }
5751         }
5752         $self->set_comment(main::join_lines($comment));
5753         return;
5754     }
5755
5756     my %swash_keys; # Makes sure don't duplicate swash names.
5757
5758     # The remaining variables are temporaries used while writing each table,
5759     # to output special ranges.
5760     my $has_hangul_syllables;
5761     my @multi_code_point_maps;  # Map is to more than one code point.
5762
5763     # The key is the base name of the code point, and the value is an
5764     # array giving all the ranges that use this base name.  Each range
5765     # is actually a hash giving the 'low' and 'high' values of it.
5766     my %names_ending_in_code_point;
5767
5768     # Inverse mapping.  The list of ranges that have these kinds of
5769     # names.  Each element contains the low, high, and base names in a
5770     # hash.
5771     my @code_points_ending_in_code_point;
5772
5773     sub handle_special_range {
5774         # Called in the middle of write when it finds a range it doesn't know
5775         # how to handle.
5776
5777         my $self = shift;
5778         my $range = shift;
5779         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5780
5781         my $addr = do { no overloading; pack 'J', $self; };
5782
5783         my $type = $range->type;
5784
5785         my $low = $range->start;
5786         my $high = $range->end;
5787         my $map = $range->value;
5788
5789         # No need to output the range if it maps to the default.
5790         return if $map eq $default_map{$addr};
5791
5792         # Switch based on the map type...
5793         if ($type == $HANGUL_SYLLABLE) {
5794
5795             # These are entirely algorithmically determinable based on
5796             # some constants furnished by Unicode; for now, just set a
5797             # flag to indicate that have them.  After everything is figured
5798             # out, we will output the code that does the algorithm.
5799             $has_hangul_syllables = 1;
5800         }
5801         elsif ($type == $CP_IN_NAME) {
5802
5803             # Code points whose the name ends in their code point are also
5804             # algorithmically determinable, but need information about the map
5805             # to do so.  Both the map and its inverse are stored in data
5806             # structures output in the file.
5807             push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5808             push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5809
5810             push @code_points_ending_in_code_point, { low => $low,
5811                                                         high => $high,
5812                                                         name => $map
5813                                                     };
5814         }
5815         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5816
5817             # Multi-code point maps and null string maps have an entry
5818             # for each code point in the range.  They use the same
5819             # output format.
5820             for my $code_point ($low .. $high) {
5821
5822                 # The pack() below can't cope with surrogates.
5823                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5824                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
5825                     next;
5826                 }
5827
5828                 # Generate the hash entries for these in the form that
5829                 # utf8.c understands.
5830                 my $tostr = "";
5831                 my $to_name = "";
5832                 my $to_chr = "";
5833                 foreach my $to (split " ", $map) {
5834                     if ($to !~ /^$code_point_re$/) {
5835                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
5836                         next;
5837                     }
5838                     $tostr .= sprintf "\\x{%s}", $to;
5839                     $to = CORE::hex $to;
5840                     if ($annotate) {
5841                         $to_name .= " + " if $to_name;
5842                         $to_chr .= chr($to);
5843                         main::populate_char_info($to)
5844                                             if ! defined $viacode[$to];
5845                         $to_name .=  $viacode[$to];
5846                     }
5847                 }
5848
5849                 # I (khw) have never waded through this line to
5850                 # understand it well enough to comment it.
5851                 my $utf8 = sprintf(qq["%s" => "$tostr",],
5852                         join("", map { sprintf "\\x%02X", $_ }
5853                             unpack("U0C*", pack("U", $code_point))));
5854
5855                 # Add a comment so that a human reader can more easily
5856                 # see what's going on.
5857                 push @multi_code_point_maps,
5858                         sprintf("%-45s # U+%04X", $utf8, $code_point);
5859                 if (! $annotate) {
5860                     $multi_code_point_maps[-1] .= " => $map";
5861                 }
5862                 else {
5863                     main::populate_char_info($code_point)
5864                                     if ! defined $viacode[$code_point];
5865                     $multi_code_point_maps[-1] .= " '"
5866                         . chr($code_point)
5867                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
5868                 }
5869             }
5870         }
5871         else {
5872             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
5873         }
5874
5875         return;
5876     }
5877
5878     sub pre_body {
5879         # Returns the string that should be output in the file before the main
5880         # body of this table.  It isn't called until the main body is
5881         # calculated, saving a pass.  The string includes some hash entries
5882         # identifying the format of the body, and what the single value should
5883         # be for all ranges missing from it.  It also includes any code points
5884         # which have map_types that don't go in the main table.
5885
5886         my $self = shift;
5887         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5888
5889         my $addr = do { no overloading; pack 'J', $self; };
5890
5891         my $name = $self->property->swash_name;
5892
5893         if (defined $swash_keys{$name}) {
5894             Carp::my_carp(join_lines(<<END
5895 Already created a swash name '$name' for $swash_keys{$name}.  This means that
5896 the same name desired for $self shouldn't be used.  Bad News.  This must be
5897 fixed before production use, but proceeding anyway
5898 END
5899             ));
5900         }
5901         $swash_keys{$name} = "$self";
5902
5903         my $pre_body = "";
5904
5905         # Here we assume we were called after have gone through the whole
5906         # file.  If we actually generated anything for each map type, add its
5907         # respective header and trailer
5908         if (@multi_code_point_maps) {
5909             $pre_body .= <<END;
5910
5911 # Some code points require special handling because their mappings are each to
5912 # multiple code points.  These do not appear in the main body, but are defined
5913 # in the hash below.
5914
5915 # Each key is the string of N bytes that together make up the UTF-8 encoding
5916 # for the code point.  (i.e. the same as looking at the code point's UTF-8
5917 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
5918 %utf8::ToSpec$name = (
5919 END
5920             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5921         }
5922
5923         if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5924
5925             # Convert these structures to output format.
5926             my $code_points_ending_in_code_point =
5927                 main::simple_dumper(\@code_points_ending_in_code_point,
5928                                     ' ' x 8);
5929             my $names = main::simple_dumper(\%names_ending_in_code_point,
5930                                             ' ' x 8);
5931
5932             # Do the same with the Hangul names,
5933             my $jamo;
5934             my $jamo_l;
5935             my $jamo_v;
5936             my $jamo_t;
5937             my $jamo_re;
5938             if ($has_hangul_syllables) {
5939
5940                 # Construct a regular expression of all the possible
5941                 # combinations of the Hangul syllables.
5942                 my @L_re;   # Leading consonants
5943                 for my $i ($LBase .. $LBase + $LCount - 1) {
5944                     push @L_re, $Jamo{$i}
5945                 }
5946                 my @V_re;   # Middle vowels
5947                 for my $i ($VBase .. $VBase + $VCount - 1) {
5948                     push @V_re, $Jamo{$i}
5949                 }
5950                 my @T_re;   # Trailing consonants
5951                 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5952                     push @T_re, $Jamo{$i}
5953                 }
5954
5955                 # The whole re is made up of the L V T combination.
5956                 $jamo_re = '('
5957                             . join ('|', sort @L_re)
5958                             . ')('
5959                             . join ('|', sort @V_re)
5960                             . ')('
5961                             . join ('|', sort @T_re)
5962                             . ')?';
5963
5964                 # These hashes needed by the algorithm were generated
5965                 # during reading of the Jamo.txt file
5966                 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5967                 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5968                 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5969                 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5970             }
5971
5972             $pre_body .= <<END;
5973
5974 # To achieve significant memory savings when this file is read in,
5975 # algorithmically derivable code points are omitted from the main body below.
5976 # Instead, the following routines can be used to translate between name and
5977 # code point and vice versa
5978
5979 { # Closure
5980
5981     # Matches legal code point.  4-6 hex numbers, If there are 6, the
5982     # first two must be '10'; if there are 5, the first must not be a '0'.
5983     my \$code_point_re = qr/$code_point_re/;
5984
5985     # In the following hash, the keys are the bases of names which includes
5986     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
5987     # of each key is another hash which is used to get the low and high ends
5988     # for each range of code points that apply to the name
5989     my %names_ending_in_code_point = (
5990 $names
5991     );
5992
5993     # And the following array gives the inverse mapping from code points to
5994     # names.  Lowest code points are first
5995     my \@code_points_ending_in_code_point = (
5996 $code_points_ending_in_code_point
5997     );
5998 END
5999             # Earlier releases didn't have Jamos.  No sense outputting
6000             # them unless will be used.
6001             if ($has_hangul_syllables) {
6002                 $pre_body .= <<END;
6003
6004     # Convert from code point to Jamo short name for use in composing Hangul
6005     # syllable names
6006     my %Jamo = (
6007 $jamo
6008     );
6009
6010     # Leading consonant (can be null)
6011     my %Jamo_L = (
6012 $jamo_l
6013     );
6014
6015     # Vowel
6016     my %Jamo_V = (
6017 $jamo_v
6018     );
6019
6020     # Optional trailing consonant
6021     my %Jamo_T = (
6022 $jamo_t
6023     );
6024
6025     # Computed re that splits up a Hangul name into LVT or LV syllables
6026     my \$syllable_re = qr/$jamo_re/;
6027
6028     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
6029     my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
6030
6031     # These constants names and values were taken from the Unicode standard,
6032     # version 5.1, section 3.12.  They are used in conjunction with Hangul
6033     # syllables
6034     my \$SBase = $SBase_string;
6035     my \$LBase = $LBase_string;
6036     my \$VBase = $VBase_string;
6037     my \$TBase = $TBase_string;
6038     my \$SCount = $SCount;
6039     my \$LCount = $LCount;
6040     my \$VCount = $VCount;
6041     my \$TCount = $TCount;
6042     my \$NCount = \$VCount * \$TCount;
6043 END
6044             } # End of has Jamos
6045
6046             $pre_body .= << 'END';
6047
6048     sub name_to_code_point_special {
6049         my $name = shift;
6050
6051         # Returns undef if not one of the specially handled names; otherwise
6052         # returns the code point equivalent to the input name
6053 END
6054             if ($has_hangul_syllables) {
6055                 $pre_body .= << 'END';
6056
6057         if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
6058             $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
6059             return if $name !~ qr/^$syllable_re$/;
6060             my $L = $Jamo_L{$1};
6061             my $V = $Jamo_V{$2};
6062             my $T = (defined $3) ? $Jamo_T{$3} : 0;
6063             return ($L * $VCount + $V) * $TCount + $T + $SBase;
6064         }
6065 END
6066             }
6067             $pre_body .= << 'END';
6068
6069         # Name must end in '-code_point' for this to handle.
6070         if ($name !~ /^ (.*) - ($code_point_re) $/x) {
6071             return;
6072         }
6073
6074         my $base = $1;
6075         my $code_point = CORE::hex $2;
6076
6077         # Name must be one of the ones which has the code point in it.
6078         return if ! $names_ending_in_code_point{$base};
6079
6080         # Look through the list of ranges that apply to this name to see if
6081         # the code point is in one of them.
6082         for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
6083             return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
6084             next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
6085
6086             # Here, the code point is in the range.
6087             return $code_point;
6088         }
6089
6090         # Here, looked like the name had a code point number in it, but
6091         # did not match one of the valid ones.
6092         return;
6093     }
6094
6095     sub code_point_to_name_special {
6096         my $code_point = shift;
6097
6098         # Returns the name of a code point if algorithmically determinable;
6099         # undef if not
6100 END
6101             if ($has_hangul_syllables) {
6102                 $pre_body .= << 'END';
6103
6104         # If in the Hangul range, calculate the name based on Unicode's
6105         # algorithm
6106         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
6107             use integer;
6108             my $SIndex = $code_point - $SBase;
6109             my $L = $LBase + $SIndex / $NCount;
6110             my $V = $VBase + ($SIndex % $NCount) / $TCount;
6111             my $T = $TBase + $SIndex % $TCount;
6112             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
6113             $name .= $Jamo{$T} if $T != $TBase;
6114             return $name;
6115         }
6116 END
6117             }
6118             $pre_body .= << 'END';
6119
6120         # Look through list of these code points for one in range.
6121         foreach my $hash (@code_points_ending_in_code_point) {
6122             return if $code_point < $hash->{'low'};
6123             if ($code_point <= $hash->{'high'}) {
6124                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
6125             }
6126         }
6127         return;            # None found
6128     }
6129 } # End closure
6130
6131 END
6132         } # End of has hangul or code point in name maps.
6133
6134         my $format = $self->format;
6135
6136         my $return = <<END;
6137 # The name this swash is to be known by, with the format of the mappings in
6138 # the main body of the table, and what all code points missing from this file
6139 # map to.
6140 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6141 END
6142         my $default_map = $default_map{$addr};
6143         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6144
6145         if ($default_map eq $CODE_POINT) {
6146             $return .= ' # code point maps to itself';
6147         }
6148         elsif ($default_map eq "") {
6149             $return .= ' # code point maps to the null string';
6150         }
6151         $return .= "\n";
6152
6153         $return .= $pre_body;
6154
6155         return $return;
6156     }
6157
6158     sub write {
6159         # Write the table to the file.
6160
6161         my $self = shift;
6162         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6163
6164         my $addr = do { no overloading; pack 'J', $self; };
6165
6166         # Clear the temporaries
6167         $has_hangul_syllables = 0;
6168         undef @multi_code_point_maps;
6169         undef %names_ending_in_code_point;
6170         undef @code_points_ending_in_code_point;
6171
6172         # Calculate the format of the table if not already done.
6173         my $format = $self->format;
6174         my $type = $self->property->type;
6175         my $default_map = $self->default_map;
6176         if (! defined $format) {
6177             if ($type == $BINARY) {
6178
6179                 # Don't bother checking the values, because we elsewhere
6180                 # verify that a binary table has only 2 values.
6181                 $format = $BINARY_FORMAT;
6182             }
6183             else {
6184                 my @ranges = $self->_range_list->ranges;
6185
6186                 # default an empty table based on its type and default map
6187                 if (! @ranges) {
6188
6189                     # But it turns out that the only one we can say is a
6190                     # non-string (besides binary, handled above) is when the
6191                     # table is a string and the default map is to a code point
6192                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6193                         $format = $HEX_FORMAT;
6194                     }
6195                     else {
6196                         $format = $STRING_FORMAT;
6197                     }
6198                 }
6199                 else {
6200
6201                     # Start with the most restrictive format, and as we find
6202                     # something that doesn't fit with that, change to the next
6203                     # most restrictive, and so on.
6204                     $format = $DECIMAL_FORMAT;
6205                     foreach my $range (@ranges) {
6206                         next if $range->type != 0;  # Non-normal ranges don't
6207                                                     # affect the main body
6208                         my $map = $range->value;
6209                         if ($map ne $default_map) {
6210                             last if $format eq $STRING_FORMAT;  # already at
6211                                                                 # least
6212                                                                 # restrictive
6213                             $format = $INTEGER_FORMAT
6214                                                 if $format eq $DECIMAL_FORMAT
6215                                                     && $map !~ / ^ [0-9] $ /x;
6216                             $format = $FLOAT_FORMAT
6217                                             if $format eq $INTEGER_FORMAT
6218                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6219                             $format = $RATIONAL_FORMAT
6220                                 if $format eq $FLOAT_FORMAT
6221                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6222                             $format = $HEX_FORMAT
6223                             if $format eq $RATIONAL_FORMAT
6224                                 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6225                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6226                                                        && $map =~ /[^0-9A-F]/;
6227                         }
6228                     }
6229                 }
6230             }
6231         } # end of calculating format
6232
6233         if ($default_map eq $CODE_POINT
6234             && $format ne $HEX_FORMAT
6235             && ! defined $self->format)    # manual settings are always
6236                                            # considered ok
6237         {
6238             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6239         }
6240
6241         $self->_set_format($format);
6242
6243         return $self->SUPER::write(
6244             ($self->property == $block)
6245                 ? 7     # block file needs more tab stops
6246                 : 3,
6247             $default_map);   # don't write defaulteds
6248     }
6249
6250     # Accessors for the underlying list that should fail if locked.
6251     for my $sub (qw(
6252                     add_duplicate
6253                 ))
6254     {
6255         no strict "refs";
6256         *$sub = sub {
6257             use strict "refs";
6258             my $self = shift;
6259
6260             return if $self->carp_if_locked;
6261             return $self->_range_list->$sub(@_);
6262         }
6263     }
6264 } # End closure for Map_Table
6265
6266 package Match_Table;
6267 use base '_Base_Table';
6268
6269 # A Match table is one which is a list of all the code points that have
6270 # the same property and property value, for use in \p{property=value}
6271 # constructs in regular expressions.  It adds very little data to the base
6272 # structure, but many methods, as these lists can be combined in many ways to
6273 # form new ones.
6274 # There are only a few concepts added:
6275 # 1) Equivalents and Relatedness.
6276 #    Two tables can match the identical code points, but have different names.
6277 #    This always happens when there is a perl single form extension
6278 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
6279 #    tables are set to be related, with the Perl extension being a child, and
6280 #    the Unicode property being the parent.
6281 #
6282 #    It may be that two tables match the identical code points and we don't
6283 #    know if they are related or not.  This happens most frequently when the
6284 #    Block and Script properties have the exact range.  But note that a
6285 #    revision to Unicode could add new code points to the script, which would
6286 #    now have to be in a different block (as the block was filled, or there
6287 #    would have been 'Unknown' script code points in it and they wouldn't have
6288 #    been identical).  So we can't rely on any two properties from Unicode
6289 #    always matching the same code points from release to release, and thus
6290 #    these tables are considered coincidentally equivalent--not related.  When
6291 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
6292 #    'leader', and the others are 'equivalents'.  This concept is useful
6293 #    to minimize the number of tables written out.  Only one file is used for
6294 #    any identical set of code points, with entries in Heavy.pl mapping all
6295 #    the involved tables to it.
6296 #
6297 #    Related tables will always be identical; we set them up to be so.  Thus
6298 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
6299 #    unrelated tables.  Relatedness makes generating the documentation easier.
6300 #
6301 # 2) Conflicting.  It may be that there will eventually be name clashes, with
6302 #    the same name meaning different things.  For a while, there actually were
6303 #    conflicts, but they have so far been resolved by changing Perl's or
6304 #    Unicode's definitions to match the other, but when this code was written,
6305 #    it wasn't clear that that was what was going to happen.  (Unicode changed
6306 #    because of protests during their beta period.)  Name clashes are warned
6307 #    about during compilation, and the documentation.  The generated tables
6308 #    are sane, free of name clashes, because the code suppresses the Perl
6309 #    version.  But manual intervention to decide what the actual behavior
6310 #    should be may be required should this happen.  The introductory comments
6311 #    have more to say about this.
6312
6313 sub standardize { return main::standardize($_[0]); }
6314 sub trace { return main::trace(@_); }
6315
6316
6317 { # Closure
6318
6319     main::setup_package();
6320
6321     my %leader;
6322     # The leader table of this one; initially $self.
6323     main::set_access('leader', \%leader, 'r');
6324
6325     my %equivalents;
6326     # An array of any tables that have this one as their leader
6327     main::set_access('equivalents', \%equivalents, 'readable_array');
6328
6329     my %parent;
6330     # The parent table to this one, initially $self.  This allows us to
6331     # distinguish between equivalent tables that are related, and those which
6332     # may not be, but share the same output file because they match the exact
6333     # same set of code points in the current Unicode release.
6334     main::set_access('parent', \%parent, 'r');
6335
6336     my %children;
6337     # An array of any tables that have this one as their parent
6338     main::set_access('children', \%children, 'readable_array');
6339
6340     my %conflicting;
6341     # Array of any tables that would have the same name as this one with
6342     # a different meaning.  This is used for the generated documentation.
6343     main::set_access('conflicting', \%conflicting, 'readable_array');
6344
6345     my %matches_all;
6346     # Set in the constructor for tables that are expected to match all code
6347     # points.
6348     main::set_access('matches_all', \%matches_all, 'r');
6349
6350     sub new {
6351         my $class = shift;
6352
6353         my %args = @_;
6354
6355         # The property for which this table is a listing of property values.
6356         my $property = delete $args{'_Property'};
6357
6358         my $name = delete $args{'Name'};
6359         my $full_name = delete $args{'Full_Name'};
6360         $full_name = $name if ! defined $full_name;
6361
6362         # Optional
6363         my $initialize = delete $args{'Initialize'};
6364         my $matches_all = delete $args{'Matches_All'} || 0;
6365         my $format = delete $args{'Format'};
6366         # Rest of parameters passed on.
6367
6368         my $range_list = Range_List->new(Initialize => $initialize,
6369                                          Owner => $property);
6370
6371         my $complete = $full_name;
6372         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
6373                                               # but this helps debug if it
6374                                               # does
6375         # The complete name for a match table includes it's property in a
6376         # compound form 'property=table', except if the property is the
6377         # pseudo-property, perl, in which case it is just the single form,
6378         # 'table' (If you change the '=' must also change the ':' in lots of
6379         # places in this program that assume an equal sign)
6380         $complete = $property->full_name . "=$complete" if $property != $perl;
6381
6382         my $self = $class->SUPER::new(%args,
6383                                       Name => $name,
6384                                       Complete_Name => $complete,
6385                                       Full_Name => $full_name,
6386                                       _Property => $property,
6387                                       _Range_List => $range_list,
6388                                       Format => $EMPTY_FORMAT,
6389                                       );
6390         my $addr = do { no overloading; pack 'J', $self; };
6391
6392         $conflicting{$addr} = [ ];
6393         $equivalents{$addr} = [ ];
6394         $children{$addr} = [ ];
6395         $matches_all{$addr} = $matches_all;
6396         $leader{$addr} = $self;
6397         $parent{$addr} = $self;
6398
6399         if (defined $format && $format ne $EMPTY_FORMAT) {
6400             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
6401         }
6402
6403         return $self;
6404     }
6405
6406     # See this program's beginning comment block about overloading these.
6407     use overload
6408         fallback => 0,
6409         qw("") => "_operator_stringify",
6410         '=' => sub {
6411                     my $self = shift;
6412
6413                     return if $self->carp_if_locked;
6414                     return $self;
6415                 },
6416
6417         '+' => sub {
6418                         my $self = shift;
6419                         my $other = shift;
6420
6421                         return $self->_range_list + $other;
6422                     },
6423         '&' => sub {
6424                         my $self = shift;
6425                         my $other = shift;
6426
6427                         return $self->_range_list & $other;
6428                     },
6429         '+=' => sub {
6430                         my $self = shift;
6431                         my $other = shift;
6432
6433                         return if $self->carp_if_locked;
6434
6435                         my $addr = do { no overloading; pack 'J', $self; };
6436
6437                         if (ref $other) {
6438
6439                             # Change the range list of this table to be the
6440                             # union of the two.
6441                             $self->_set_range_list($self->_range_list
6442                                                     + $other);
6443                         }
6444                         else {    # $other is just a simple value
6445                             $self->add_range($other, $other);
6446                         }
6447                         return $self;
6448                     },
6449         '-' => sub { my $self = shift;
6450                     my $other = shift;
6451                     my $reversed = shift;
6452
6453                     if ($reversed) {
6454                         Carp::my_carp_bug("Can't cope with a "
6455                             .  __PACKAGE__
6456                             . " being the first parameter in a '-'.  Subtraction ignored.");
6457                         return;
6458                     }
6459
6460                     return $self->_range_list - $other;
6461                 },
6462         '~' => sub { my $self = shift;
6463                     return ~ $self->_range_list;
6464                 },
6465     ;
6466
6467     sub _operator_stringify {
6468         my $self = shift;
6469
6470         my $name = $self->complete_name;
6471         return "Table '$name'";
6472     }
6473
6474     sub add_alias {
6475         # Add a synonym for this table.  See the comments in the base class
6476
6477         my $self = shift;
6478         my $name = shift;
6479         # Rest of parameters passed on.
6480
6481         $self->SUPER::add_alias($name, $self, @_);
6482         return;
6483     }
6484
6485     sub add_conflicting {
6486         # Add the name of some other object to the list of ones that name
6487         # clash with this match table.
6488
6489         my $self = shift;
6490         my $conflicting_name = shift;   # The name of the conflicting object
6491         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
6492         my $conflicting_object = shift; # Optional, the conflicting object
6493                                         # itself.  This is used to
6494                                         # disambiguate the text if the input
6495                                         # name is identical to any of the
6496                                         # aliases $self is known by.
6497                                         # Sometimes the conflicting object is
6498                                         # merely hypothetical, so this has to
6499                                         # be an optional parameter.
6500         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6501
6502         my $addr = do { no overloading; pack 'J', $self; };
6503
6504         # Check if the conflicting name is exactly the same as any existing
6505         # alias in this table (as long as there is a real object there to
6506         # disambiguate with).
6507         if (defined $conflicting_object) {
6508             foreach my $alias ($self->aliases) {
6509                 if ($alias->name eq $conflicting_name) {
6510
6511                     # Here, there is an exact match.  This results in
6512                     # ambiguous comments, so disambiguate by changing the
6513                     # conflicting name to its object's complete equivalent.
6514                     $conflicting_name = $conflicting_object->complete_name;
6515                     last;
6516                 }
6517             }
6518         }
6519
6520         # Convert to the \p{...} final name
6521         $conflicting_name = "\\$p" . "{$conflicting_name}";
6522
6523         # Only add once
6524         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6525
6526         push @{$conflicting{$addr}}, $conflicting_name;
6527
6528         return;
6529     }
6530
6531     sub is_set_equivalent_to {
6532         # Return boolean of whether or not the other object is a table of this
6533         # type and has been marked equivalent to this one.
6534
6535         my $self = shift;
6536         my $other = shift;
6537         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6538
6539         return 0 if ! defined $other; # Can happen for incomplete early
6540                                       # releases
6541         unless ($other->isa(__PACKAGE__)) {
6542             my $ref_other = ref $other;
6543             my $ref_self = ref $self;
6544             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
6545             return 0;
6546         }
6547
6548         # Two tables are equivalent if they have the same leader.
6549         no overloading;
6550         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6551         return;
6552     }
6553
6554     sub set_equivalent_to {
6555         # Set $self equivalent to the parameter table.
6556         # The required Related => 'x' parameter is a boolean indicating
6557         # whether these tables are related or not.  If related, $other becomes
6558         # the 'parent' of $self; if unrelated it becomes the 'leader'
6559         #
6560         # Related tables share all characteristics except names; equivalents
6561         # not quite so many.
6562         # If they are related, one must be a perl extension.  This is because
6563         # we can't guarantee that Unicode won't change one or the other in a
6564         # later release even if they are identical now.
6565
6566         my $self = shift;
6567         my $other = shift;
6568
6569         my %args = @_;
6570         my $related = delete $args{'Related'};
6571
6572         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6573
6574         return if ! defined $other;     # Keep on going; happens in some early
6575                                         # Unicode releases.
6576
6577         if (! defined $related) {
6578             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
6579             $related = 0;
6580         }
6581
6582         # If already are equivalent, no need to re-do it;  if subroutine
6583         # returns null, it found an error, also do nothing
6584         my $are_equivalent = $self->is_set_equivalent_to($other);
6585         return if ! defined $are_equivalent || $are_equivalent;
6586
6587         my $addr = do { no overloading; pack 'J', $self; };
6588         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6589
6590         if ($related) {
6591             if ($current_leader->perl_extension) {
6592                 if ($other->perl_extension) {
6593                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6594                     return;
6595                 }
6596             } elsif (! $other->perl_extension) {
6597                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
6598                 $related = 0;
6599             }
6600         }
6601
6602         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6603             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
6604             return;
6605         }
6606
6607         my $leader = do { no overloading; pack 'J', $current_leader; };
6608         my $other_addr = do { no overloading; pack 'J', $other; };
6609
6610         # Any tables that are equivalent to or children of this table must now
6611         # instead be equivalent to or (children) to the new leader (parent),
6612         # still equivalent.  The equivalency includes their matches_all info,
6613         # and for related tables, their status
6614         # All related tables are of necessity equivalent, but the converse
6615         # isn't necessarily true
6616         my $status = $other->status;
6617         my $status_info = $other->status_info;
6618         my $matches_all = $matches_all{other_addr};
6619         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6620             next if $table == $other;
6621             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6622
6623             my $table_addr = do { no overloading; pack 'J', $table; };
6624             $leader{$table_addr} = $other;
6625             $matches_all{$table_addr} = $matches_all;
6626             $self->_set_range_list($other->_range_list);
6627             push @{$equivalents{$other_addr}}, $table;
6628             if ($related) {
6629                 $parent{$table_addr} = $other;
6630                 push @{$children{$other_addr}}, $table;
6631                 $table->set_status($status, $status_info);
6632             }
6633         }
6634
6635         # Now that we've declared these to be equivalent, any changes to one
6636         # of the tables would invalidate that equivalency.
6637         $self->lock;
6638         $other->lock;
6639         return;
6640     }
6641
6642     sub add_range { # Add a range to the list for this table.
6643         my $self = shift;
6644         # Rest of parameters passed on
6645
6646         return if $self->carp_if_locked;
6647         return $self->_range_list->add_range(@_);
6648     }
6649
6650     sub pre_body {  # Does nothing for match tables.
6651         return
6652     }
6653
6654     sub append_to_body {  # Does nothing for match tables.
6655         return
6656     }
6657
6658     sub write {
6659         my $self = shift;
6660         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6661
6662         return $self->SUPER::write(2); # 2 tab stops
6663     }
6664
6665     sub set_final_comment {
6666         # This creates a comment for the file that is to hold the match table
6667         # $self.  It is somewhat convoluted to make the English read nicely,
6668         # but, heh, it's just a comment.
6669         # This should be called only with the leader match table of all the
6670         # ones that share the same file.  It lists all such tables, ordered so
6671         # that related ones are together.
6672
6673         return unless $debugging_build;
6674
6675         my $leader = shift;   # Should only be called on the leader table of
6676                               # an equivalent group
6677         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6678
6679         my $addr = do { no overloading; pack 'J', $leader; };
6680
6681         if ($leader{$addr} != $leader) {
6682             Carp::my_carp_bug(<<END
6683 set_final_comment() must be called on a leader table, which $leader is not.
6684 It is equivalent to $leader{$addr}.  No comment created
6685 END
6686             );
6687             return;
6688         }
6689
6690         # Get the number of code points matched by each of the tables in this
6691         # file, and add underscores for clarity.
6692         my $count = $leader->count;
6693         my $string_count = main::clarify_number($count);
6694
6695         my $loose_count = 0;        # how many aliases loosely matched
6696         my $compound_name = "";     # ? Are any names compound?, and if so, an
6697                                     # example
6698         my $properties_with_compound_names = 0;    # count of these
6699
6700
6701         my %flags;              # The status flags used in the file
6702         my $total_entries = 0;  # number of entries written in the comment
6703         my $matches_comment = ""; # The portion of the comment about the
6704                                   # \p{}'s
6705         my @global_comments;    # List of all the tables' comments that are
6706                                 # there before this routine was called.
6707
6708         # Get list of all the parent tables that are equivalent to this one
6709         # (including itself).
6710         my @parents = grep { $parent{main::objaddr $_} == $_ }
6711                             main::uniques($leader, @{$equivalents{$addr}});
6712         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
6713                                               # tables
6714
6715         for my $parent (@parents) {
6716
6717             my $property = $parent->property;
6718
6719             # Special case 'N' tables in properties with two match tables when
6720             # the other is a 'Y' one.  These are likely to be binary tables,
6721             # but not necessarily.  In either case, \P{} will match the
6722             # complement of \p{}, and so if something is a synonym of \p, the
6723             # complement of that something will be the synonym of \P.  This
6724             # would be true of any property with just two match tables, not
6725             # just those whose values are Y and N; but that would require a
6726             # little extra work, and there are none such so far in Unicode.
6727             my $perl_p = 'p';        # which is it?  \p{} or \P{}
6728             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
6729
6730             if (scalar $property->tables == 2
6731                 && $parent == $property->table('N')
6732                 && defined (my $yes = $property->table('Y')))
6733             {
6734                 my $yes_addr = do { no overloading; pack 'J', $yes; };
6735                 @yes_perl_synonyms
6736                     = grep { $_->property == $perl }
6737                                     main::uniques($yes,
6738                                                 $parent{$yes_addr},
6739                                                 $parent{$yes_addr}->children);
6740
6741                 # But these synonyms are \P{} ,not \p{}
6742                 $perl_p = 'P';
6743             }
6744
6745             my @description;        # Will hold the table description
6746             my @note;               # Will hold the table notes.
6747             my @conflicting;        # Will hold the table conflicts.
6748
6749             # Look at the parent, any yes synonyms, and all the children
6750             my $parent_addr = do { no overloading; pack 'J', $parent; };
6751             for my $table ($parent,
6752                            @yes_perl_synonyms,
6753                            @{$children{$parent_addr}})
6754             {
6755                 my $table_addr = do { no overloading; pack 'J', $table; };
6756                 my $table_property = $table->property;
6757
6758                 # Tables are separated by a blank line to create a grouping.
6759                 $matches_comment .= "\n" if $matches_comment;
6760
6761                 # The table is named based on the property and value
6762                 # combination it is for, like script=greek.  But there may be
6763                 # a number of synonyms for each side, like 'sc' for 'script',
6764                 # and 'grek' for 'greek'.  Any combination of these is a valid
6765                 # name for this table.  In this case, there are three more,
6766                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
6767                 # listing all possible combinations in the comment, we make
6768                 # sure that each synonym occurs at least once, and add
6769                 # commentary that the other combinations are possible.
6770                 my @property_aliases = $table_property->aliases;
6771                 my @table_aliases = $table->aliases;
6772
6773                 Carp::my_carp_bug("$table doesn't have any names.  Proceeding anyway.") unless @table_aliases;
6774
6775                 # The alias lists above are already ordered in the order we
6776                 # want to output them.  To ensure that each synonym is listed,
6777                 # we must use the max of the two numbers.
6778                 my $listed_combos = main::max(scalar @table_aliases,
6779                                                 scalar @property_aliases);
6780                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6781
6782                 my $property_had_compound_name = 0;
6783
6784                 for my $i (0 .. $listed_combos - 1) {
6785                     $total_entries++;
6786
6787                     # The current alias for the property is the next one on
6788                     # the list, or if beyond the end, start over.  Similarly
6789                     # for the table (\p{prop=table})
6790                     my $property_alias = $property_aliases
6791                                             [$i % @property_aliases]->name;
6792                     my $table_alias_object = $table_aliases
6793                                                         [$i % @table_aliases];
6794                     my $table_alias = $table_alias_object->name;
6795                     my $loose_match = $table_alias_object->loose_match;
6796
6797                     if ($table_alias !~ /\D/) { # Clarify large numbers.
6798                         $table_alias = main::clarify_number($table_alias)
6799                     }
6800
6801                     # Add a comment for this alias combination
6802                     my $current_match_comment;
6803                     if ($table_property == $perl) {
6804                         $current_match_comment = "\\$perl_p"
6805                                                     . "{$table_alias}";
6806                     }
6807                     else {
6808                         $current_match_comment
6809                                         = "\\p{$property_alias=$table_alias}";
6810                         $property_had_compound_name = 1;
6811                     }
6812
6813                     # Flag any abnormal status for this table.
6814                     my $flag = $property->status
6815                                 || $table->status
6816                                 || $table_alias_object->status;
6817                     if ($flag) {
6818                         if ($flag ne $PLACEHOLDER) {
6819                             $flags{$flag} = $status_past_participles{$flag};
6820                         } else {
6821                             $flags{$flag} = <<END;
6822 a placeholder because it is not in Version $string_version of Unicode, but is
6823 needed by the Perl core to work gracefully.  Because it is not in this version
6824 of Unicode, it will not be listed in $pod_file.pod
6825 END
6826                         }
6827                     }
6828
6829                     $loose_count++;
6830
6831                     # Pretty up the comment.  Note the \b; it says don't make
6832                     # this line a continuation.
6833                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
6834                                         $flag,
6835                                         " " x 7,
6836                                         $current_match_comment);
6837                 } # End of generating the entries for this table.
6838
6839                 # Save these for output after this group of related tables.
6840                 push @description, $table->description;
6841                 push @note, $table->note;
6842                 push @conflicting, $table->conflicting;
6843
6844                 # And this for output after all the tables.
6845                 push @global_comments, $table->comment;
6846
6847                 # Compute an alternate compound name using the final property
6848                 # synonym and the first table synonym with a colon instead of
6849                 # the equal sign used elsewhere.
6850                 if ($property_had_compound_name) {
6851                     $properties_with_compound_names ++;
6852                     if (! $compound_name || @property_aliases > 1) {
6853                         $compound_name = $property_aliases[-1]->name
6854                                         . ': '
6855                                         . $table_aliases[0]->name;
6856                     }
6857                 }
6858             } # End of looping through all children of this table
6859
6860             # Here have assembled in $matches_comment all the related tables
6861             # to the current parent (preceded by the same info for all the
6862             # previous parents).  Put out information that applies to all of
6863             # the current family.
6864             if (@conflicting) {
6865
6866                 # But output the conflicting information now, as it applies to
6867                 # just this table.
6868                 my $conflicting = join ", ", @conflicting;
6869                 if ($conflicting) {
6870                     $matches_comment .= <<END;
6871
6872     Note that contrary to what you might expect, the above is NOT the same as
6873 END
6874                     $matches_comment .= "any of: " if @conflicting > 1;
6875                     $matches_comment .= "$conflicting\n";
6876                 }
6877             }
6878             if (@description) {
6879                 $matches_comment .= "\n    Meaning: "
6880                                     . join('; ', @description)
6881                                     . "\n";
6882             }
6883             if (@note) {
6884                 $matches_comment .= "\n    Note: "
6885                                     . join("\n    ", @note)
6886                                     . "\n";
6887             }
6888         } # End of looping through all tables
6889
6890
6891         my $code_points;
6892         my $match;
6893         my $any_of_these;
6894         if ($count == 1) {
6895             $match = 'matches';
6896             $code_points = 'single code point';
6897         }
6898         else {
6899             $match = 'match';
6900             $code_points = "$string_count code points";
6901         }
6902
6903         my $synonyms;
6904         my $entries;
6905         if ($total_entries <= 1) {
6906             $synonyms = "";
6907             $entries = 'entry';
6908             $any_of_these = 'this'
6909         }
6910         else {
6911             $synonyms = " any of the following regular expression constructs";
6912             $entries = 'entries';
6913             $any_of_these = 'any of these'
6914         }
6915
6916         my $comment = "";
6917         if ($has_unrelated) {
6918             $comment .= <<END;
6919 This file is for tables that are not necessarily related:  To conserve
6920 resources, every table that matches the identical set of code points in this
6921 version of Unicode uses this file.  Each one is listed in a separate group
6922 below.  It could be that the tables will match the same set of code points in
6923 other Unicode releases, or it could be purely coincidence that they happen to
6924 be the same in Unicode $string_version, and hence may not in other versions.
6925
6926 END
6927         }
6928
6929         if (%flags) {
6930             foreach my $flag (sort keys %flags) {
6931                 $comment .= <<END;
6932 '$flag' below means that this form is $flags{$flag}.
6933 END
6934                 next if $flag eq $PLACEHOLDER;
6935                 $comment .= "Consult $pod_file.pod\n";
6936             }
6937             $comment .= "\n";
6938         }
6939
6940         $comment .= <<END;
6941 This file returns the $code_points in Unicode Version $string_version that
6942 $match$synonyms:
6943
6944 $matches_comment
6945 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
6946 including if adding or subtracting white space, underscore, and hyphen
6947 characters matters or doesn't matter, and other permissible syntactic
6948 variants.  Upper/lower case distinctions never matter.
6949 END
6950
6951         if ($compound_name) {
6952             $comment .= <<END;
6953
6954 A colon can be substituted for the equals sign, and
6955 END
6956             if ($properties_with_compound_names > 1) {
6957                 $comment .= <<END;
6958 within each group above,
6959 END
6960             }
6961             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6962
6963             # Note the \b below, it says don't make that line a continuation.
6964             $comment .= <<END;
6965 anything to the left of the equals (or colon) can be combined with anything to
6966 the right.  Thus, for example,
6967 $compound_name
6968 \bis also valid.
6969 END
6970         }
6971
6972         # And append any comment(s) from the actual tables.  They are all
6973         # gathered here, so may not read all that well.
6974         if (@global_comments) {
6975             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6976         }
6977
6978         if ($count) {   # The format differs if no code points, and needs no
6979                         # explanation in that case
6980                 $comment.= <<END;
6981
6982 The format of the lines of this file is:
6983 END
6984             $comment.= <<END;
6985 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6986 STOP is the ending point, or if omitted, the range has just one code point.
6987 END
6988             if ($leader->output_range_counts) {
6989                 $comment .= <<END;
6990 Numbers in comments in [brackets] indicate how many code points are in the
6991 range.
6992 END
6993             }
6994         }
6995
6996         $leader->set_comment(main::join_lines($comment));
6997         return;
6998     }
6999
7000     # Accessors for the underlying list
7001     for my $sub (qw(
7002                     get_valid_code_point
7003                     get_invalid_code_point
7004                 ))
7005     {
7006         no strict "refs";
7007         *$sub = sub {
7008             use strict "refs";
7009             my $self = shift;
7010
7011             return $self->_range_list->$sub(@_);
7012         }
7013     }
7014 } # End closure for Match_Table
7015
7016 package Property;
7017
7018 # The Property class represents a Unicode property, or the $perl
7019 # pseudo-property.  It contains a map table initialized empty at construction
7020 # time, and for properties accessible through regular expressions, various
7021 # match tables, created through the add_match_table() method, and referenced
7022 # by the table('NAME') or tables() methods, the latter returning a list of all
7023 # of the match tables.  Otherwise table operations implicitly are for the map
7024 # table.
7025 #
7026 # Most of the data in the property is actually about its map table, so it
7027 # mostly just uses that table's accessors for most methods.  The two could
7028 # have been combined into one object, but for clarity because of their
7029 # differing semantics, they have been kept separate.  It could be argued that
7030 # the 'file' and 'directory' fields should be kept with the map table.
7031 #
7032 # Each property has a type.  This can be set in the constructor, or in the
7033 # set_type accessor, but mostly it is figured out by the data.  Every property
7034 # starts with unknown type, overridden by a parameter to the constructor, or
7035 # as match tables are added, or ranges added to the map table, the data is
7036 # inspected, and the type changed.  After the table is mostly or entirely
7037 # filled, compute_type() should be called to finalize they analysis.
7038 #
7039 # There are very few operations defined.  One can safely remove a range from
7040 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7041 # table to this one, replacing any in the intersection of the two.
7042
7043 sub standardize { return main::standardize($_[0]); }
7044 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7045
7046 {   # Closure
7047
7048     # This hash will contain as keys, all the aliases of all properties, and
7049     # as values, pointers to their respective property objects.  This allows
7050     # quick look-up of a property from any of its names.
7051     my %alias_to_property_of;
7052
7053     sub dump_alias_to_property_of {
7054         # For debugging
7055
7056         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7057         return;
7058     }
7059
7060     sub property_ref {
7061         # This is a package subroutine, not called as a method.
7062         # If the single parameter is a literal '*' it returns a list of all
7063         # defined properties.
7064         # Otherwise, the single parameter is a name, and it returns a pointer
7065         # to the corresponding property object, or undef if none.
7066         #
7067         # Properties can have several different names.  The 'standard' form of
7068         # each of them is stored in %alias_to_property_of as they are defined.
7069         # But it's possible that this subroutine will be called with some
7070         # variant, so if the initial lookup fails, it is repeated with the
7071         # standardized form of the input name.  If found, besides returning the
7072         # result, the input name is added to the list so future calls won't
7073         # have to do the conversion again.
7074
7075         my $name = shift;
7076
7077         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7078
7079         if (! defined $name) {
7080             Carp::my_carp_bug("Undefined input property.  No action taken.");
7081             return;
7082         }
7083
7084         return main::uniques(values %alias_to_property_of) if $name eq '*';
7085
7086         # Return cached result if have it.
7087         my $result = $alias_to_property_of{$name};
7088         return $result if defined $result;
7089
7090         # Convert the input to standard form.
7091         my $standard_name = standardize($name);
7092
7093         $result = $alias_to_property_of{$standard_name};
7094         return unless defined $result;        # Don't cache undefs
7095
7096         # Cache the result before returning it.
7097         $alias_to_property_of{$name} = $result;
7098         return $result;
7099     }
7100
7101
7102     main::setup_package();
7103
7104     my %map;
7105     # A pointer to the map table object for this property
7106     main::set_access('map', \%map);
7107
7108     my %full_name;
7109     # The property's full name.  This is a duplicate of the copy kept in the
7110     # map table, but is needed because stringify needs it during
7111     # construction of the map table, and then would have a chicken before egg
7112     # problem.
7113     main::set_access('full_name', \%full_name, 'r');
7114
7115     my %table_ref;
7116     # This hash will contain as keys, all the aliases of any match tables
7117     # attached to this property, and as values, the pointers to their
7118     # respective tables.  This allows quick look-up of a table from any of its
7119     # names.
7120     main::set_access('table_ref', \%table_ref);
7121
7122     my %type;
7123     # The type of the property, $ENUM, $BINARY, etc
7124     main::set_access('type', \%type, 'r');
7125
7126     my %file;
7127     # The filename where the map table will go (if actually written).
7128     # Normally defaulted, but can be overridden.
7129     main::set_access('file', \%file, 'r', 's');
7130
7131     my %directory;
7132     # The directory where the map table will go (if actually written).
7133     # Normally defaulted, but can be overridden.
7134     main::set_access('directory', \%directory, 's');
7135
7136     my %pseudo_map_type;
7137     # This is used to affect the calculation of the map types for all the
7138     # ranges in the table.  It should be set to one of the values that signify
7139     # to alter the calculation.
7140     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7141
7142     my %has_only_code_point_maps;
7143     # A boolean used to help in computing the type of data in the map table.
7144     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7145
7146     my %unique_maps;
7147     # A list of the first few distinct mappings this property has.  This is
7148     # used to disambiguate between binary and enum property types, so don't
7149     # have to keep more than three.
7150     main::set_access('unique_maps', \%unique_maps);
7151
7152     sub new {
7153         # The only required parameter is the positionally first, name.  All
7154         # other parameters are key => value pairs.  See the documentation just
7155         # above for the meanings of the ones not passed directly on to the map
7156         # table constructor.
7157
7158         my $class = shift;
7159         my $name = shift || "";
7160
7161         my $self = property_ref($name);
7162         if (defined $self) {
7163             my $options_string = join ", ", @_;
7164             $options_string = ".  Ignoring options $options_string" if $options_string;
7165             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
7166             return $self;
7167         }
7168
7169         my %args = @_;
7170
7171         $self = bless \do { my $anonymous_scalar }, $class;
7172         my $addr = do { no overloading; pack 'J', $self; };
7173
7174         $directory{$addr} = delete $args{'Directory'};
7175         $file{$addr} = delete $args{'File'};
7176         $full_name{$addr} = delete $args{'Full_Name'} || $name;
7177         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7178         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7179         # Rest of parameters passed on.
7180
7181         $has_only_code_point_maps{$addr} = 1;
7182         $table_ref{$addr} = { };
7183         $unique_maps{$addr} = { };
7184
7185         $map{$addr} = Map_Table->new($name,
7186                                     Full_Name => $full_name{$addr},
7187                                     _Alias_Hash => \%alias_to_property_of,
7188                                     _Property => $self,
7189                                     %args);
7190         return $self;
7191     }
7192
7193     # See this program's beginning comment block about overloading the copy
7194     # constructor.  Few operations are defined on properties, but a couple are
7195     # useful.  It is safe to take the inverse of a property, and to remove a
7196     # single code point from it.
7197     use overload
7198         fallback => 0,
7199         qw("") => "_operator_stringify",
7200         "." => \&main::_operator_dot,
7201         '==' => \&main::_operator_equal,
7202         '!=' => \&main::_operator_not_equal,
7203         '=' => sub { return shift },
7204         '-=' => "_minus_and_equal",
7205     ;
7206
7207     sub _operator_stringify {
7208         return "Property '" .  shift->full_name . "'";
7209     }
7210
7211     sub _minus_and_equal {
7212         # Remove a single code point from the map table of a property.
7213
7214         my $self = shift;
7215         my $other = shift;
7216         my $reversed = shift;
7217         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7218
7219         if (ref $other) {
7220             Carp::my_carp_bug("Can't cope with a "
7221                         . ref($other)
7222                         . " argument to '-='.  Subtraction ignored.");
7223             return $self;
7224         }
7225         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
7226             Carp::my_carp_bug("Can't cope with a "
7227             .  __PACKAGE__
7228             . " being the first parameter in a '-='.  Subtraction ignored.");
7229             return $self;
7230         }
7231         else {
7232             no overloading;
7233             $map{pack 'J', $self}->delete_range($other, $other);
7234         }
7235         return $self;
7236     }
7237
7238     sub add_match_table {
7239         # Add a new match table for this property, with name given by the
7240         # parameter.  It returns a pointer to the table.
7241
7242         my $self = shift;
7243         my $name = shift;
7244         my %args = @_;
7245
7246         my $addr = do { no overloading; pack 'J', $self; };
7247
7248         my $table = $table_ref{$addr}{$name};
7249         my $standard_name = main::standardize($name);
7250         if (defined $table
7251             || (defined ($table = $table_ref{$addr}{$standard_name})))
7252         {
7253             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
7254             $table_ref{$addr}{$name} = $table;
7255             return $table;
7256         }
7257         else {
7258
7259             # See if this is a perl extension, if not passed in.
7260             my $perl_extension = delete $args{'Perl_Extension'};
7261             $perl_extension
7262                         = $self->perl_extension if ! defined $perl_extension;
7263
7264             $table = Match_Table->new(
7265                                 Name => $name,
7266                                 Perl_Extension => $perl_extension,
7267                                 _Alias_Hash => $table_ref{$addr},
7268                                 _Property => $self,
7269
7270                                 # gets property's status by default
7271                                 Status => $self->status,
7272                                 _Status_Info => $self->status_info,
7273                                 %args,
7274                                 Internal_Only_Warning => 1); # Override any
7275                                                              # input param
7276             return unless defined $table;
7277         }
7278
7279         # Save the names for quick look up
7280         $table_ref{$addr}{$standard_name} = $table;
7281         $table_ref{$addr}{$name} = $table;
7282
7283         # Perhaps we can figure out the type of this property based on the
7284         # fact of adding this match table.  First, string properties don't
7285         # have match tables; second, a binary property can't have 3 match
7286         # tables
7287         if ($type{$addr} == $UNKNOWN) {
7288             $type{$addr} = $NON_STRING;
7289         }
7290         elsif ($type{$addr} == $STRING) {
7291             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
7292             $type{$addr} = $NON_STRING;
7293         }
7294         elsif ($type{$addr} != $ENUM) {
7295             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7296                 && $type{$addr} == $BINARY)
7297             {
7298                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
7299                 $type{$addr} = $ENUM;
7300             }
7301         }
7302
7303         return $table;
7304     }
7305
7306     sub table {
7307         # Return a pointer to the match table (with name given by the
7308         # parameter) associated with this property; undef if none.
7309
7310         my $self = shift;
7311         my $name = shift;
7312         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7313
7314         my $addr = do { no overloading; pack 'J', $self; };
7315
7316         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7317
7318         # If quick look-up failed, try again using the standard form of the
7319         # input name.  If that succeeds, cache the result before returning so
7320         # won't have to standardize this input name again.
7321         my $standard_name = main::standardize($name);
7322         return unless defined $table_ref{$addr}{$standard_name};
7323
7324         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7325         return $table_ref{$addr}{$name};
7326     }
7327
7328     sub tables {
7329         # Return a list of pointers to all the match tables attached to this
7330         # property
7331
7332         no overloading;
7333         return main::uniques(values %{$table_ref{pack 'J', shift}});
7334     }
7335
7336     sub directory {
7337         # Returns the directory the map table for this property should be
7338         # output in.  If a specific directory has been specified, that has
7339         # priority;  'undef' is returned if the type isn't defined;
7340         # or $map_directory for everything else.
7341
7342         my $addr = do { no overloading; pack 'J', shift; };
7343
7344         return $directory{$addr} if defined $directory{$addr};
7345         return undef if $type{$addr} == $UNKNOWN;
7346         return $map_directory;
7347     }
7348
7349     sub swash_name {
7350         # Return the name that is used to both:
7351         #   1)  Name the file that the map table is written to.
7352         #   2)  The name of swash related stuff inside that file.
7353         # The reason for this is that the Perl core historically has used
7354         # certain names that aren't the same as the Unicode property names.
7355         # To continue using these, $file is hard-coded in this file for those,
7356         # but otherwise the standard name is used.  This is different from the
7357         # external_name, so that the rest of the files, like in lib can use
7358         # the standard name always, without regard to historical precedent.
7359
7360         my $self = shift;
7361         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7362
7363         my $addr = do { no overloading; pack 'J', $self; };
7364
7365         return $file{$addr} if defined $file{$addr};
7366         return $map{$addr}->external_name;
7367     }
7368
7369     sub to_create_match_tables {
7370         # Returns a boolean as to whether or not match tables should be
7371         # created for this property.
7372
7373         my $self = shift;
7374         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7375
7376         # The whole point of this pseudo property is match tables.
7377         return 1 if $self == $perl;
7378
7379         my $addr = do { no overloading; pack 'J', $self; };
7380
7381         # Don't generate tables of code points that match the property values
7382         # of a string property.  Such a list would most likely have many
7383         # property values, each with just one or very few code points mapping
7384         # to it.
7385         return 0 if $type{$addr} == $STRING;
7386
7387         # Don't generate anything for unimplemented properties.
7388         return 0 if grep { $self->complete_name eq $_ }
7389                                                     @unimplemented_properties;
7390         # Otherwise, do.
7391         return 1;
7392     }
7393
7394     sub property_add_or_replace_non_nulls {
7395         # This adds the mappings in the property $other to $self.  Non-null
7396         # mappings from $other override those in $self.  It essentially merges
7397         # the two properties, with the second having priority except for null
7398         # mappings.
7399
7400         my $self = shift;
7401         my $other = shift;
7402         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7403
7404         if (! $other->isa(__PACKAGE__)) {
7405             Carp::my_carp_bug("$other should be a "
7406                             . __PACKAGE__
7407                             . ".  Not a '"
7408                             . ref($other)
7409                             . "'.  Not added;");
7410             return;
7411         }
7412
7413         no overloading;
7414         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7415     }
7416
7417     sub set_type {
7418         # Set the type of the property.  Mostly this is figured out by the
7419         # data in the table.  But this is used to set it explicitly.  The
7420         # reason it is not a standard accessor is that when setting a binary
7421         # property, we need to make sure that all the true/false aliases are
7422         # present, as they were omitted in early Unicode releases.
7423
7424         my $self = shift;
7425         my $type = shift;
7426         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7427
7428         if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7429             Carp::my_carp("Unrecognized type '$type'.  Type not set");
7430             return;
7431         }
7432
7433         { no overloading; $type{pack 'J', $self} = $type; }
7434         return if $type != $BINARY;
7435
7436         my $yes = $self->table('Y');
7437         $yes = $self->table('Yes') if ! defined $yes;
7438         $yes = $self->add_match_table('Y') if ! defined $yes;
7439         $yes->add_alias('Yes');
7440         $yes->add_alias('T');
7441         $yes->add_alias('True');
7442
7443         my $no = $self->table('N');
7444         $no = $self->table('No') if ! defined $no;
7445         $no = $self->add_match_table('N') if ! defined $no;
7446         $no->add_alias('No');
7447         $no->add_alias('F');
7448         $no->add_alias('False');
7449         return;
7450     }
7451
7452     sub add_map {
7453         # Add a map to the property's map table.  This also keeps
7454         # track of the maps so that the property type can be determined from
7455         # its data.
7456
7457         my $self = shift;
7458         my $start = shift;  # First code point in range
7459         my $end = shift;    # Final code point in range
7460         my $map = shift;    # What the range maps to.
7461         # Rest of parameters passed on.
7462
7463         my $addr = do { no overloading; pack 'J', $self; };
7464
7465         # If haven't the type of the property, gather information to figure it
7466         # out.
7467         if ($type{$addr} == $UNKNOWN) {
7468
7469             # If the map contains an interior blank or dash, or most other
7470             # nonword characters, it will be a string property.  This
7471             # heuristic may actually miss some string properties.  If so, they
7472             # may need to have explicit set_types called for them.  This
7473             # happens in the Unihan properties.
7474             if ($map =~ / (?<= . ) [ -] (?= . ) /x
7475                 || $map =~ / [^\w.\/\ -]  /x)
7476             {
7477                 $self->set_type($STRING);
7478
7479                 # $unique_maps is used for disambiguating between ENUM and
7480                 # BINARY later; since we know the property is not going to be
7481                 # one of those, no point in keeping the data around
7482                 undef $unique_maps{$addr};
7483             }
7484             else {
7485
7486                 # Not necessarily a string.  The final decision has to be
7487                 # deferred until all the data are in.  We keep track of if all
7488                 # the values are code points for that eventual decision.
7489                 $has_only_code_point_maps{$addr} &=
7490                                             $map =~ / ^ $code_point_re $/x;
7491
7492                 # For the purposes of disambiguating between binary and other
7493                 # enumerations at the end, we keep track of the first three
7494                 # distinct property values.  Once we get to three, we know
7495                 # it's not going to be binary, so no need to track more.
7496                 if (scalar keys %{$unique_maps{$addr}} < 3) {
7497                     $unique_maps{$addr}{main::standardize($map)} = 1;
7498                 }
7499             }
7500         }
7501
7502         # Add the mapping by calling our map table's method
7503         return $map{$addr}->add_map($start, $end, $map, @_);
7504     }
7505
7506     sub compute_type {
7507         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
7508         # should be called after the property is mostly filled with its maps.
7509         # We have been keeping track of what the property values have been,
7510         # and now have the necessary information to figure out the type.
7511
7512         my $self = shift;
7513         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7514
7515         my $addr = do { no overloading; pack 'J', $self; };
7516
7517         my $type = $type{$addr};
7518
7519         # If already have figured these out, no need to do so again, but we do
7520         # a double check on ENUMS to make sure that a string property hasn't
7521         # improperly been classified as an ENUM, so continue on with those.
7522         return if $type == $STRING || $type == $BINARY;
7523
7524         # If every map is to a code point, is a string property.
7525         if ($type == $UNKNOWN
7526             && ($has_only_code_point_maps{$addr}
7527                 || (defined $map{$addr}->default_map
7528                     && $map{$addr}->default_map eq "")))
7529         {
7530             $self->set_type($STRING);
7531         }
7532         else {
7533
7534             # Otherwise, it is to some sort of enumeration.  (The case where
7535             # it is a Unicode miscellaneous property, and treated like a
7536             # string in this program is handled in add_map()).  Distinguish
7537             # between binary and some other enumeration type.  Of course, if
7538             # there are more than two values, it's not binary.  But more
7539             # subtle is the test that the default mapping is defined means it
7540             # isn't binary.  This in fact may change in the future if Unicode
7541             # changes the way its data is structured.  But so far, no binary
7542             # properties ever have @missing lines for them, so the default map
7543             # isn't defined for them.  The few properties that are two-valued
7544             # and aren't considered binary have the default map defined
7545             # starting in Unicode 5.0, when the @missing lines appeared; and
7546             # this program has special code to put in a default map for them
7547             # for earlier than 5.0 releases.
7548             if ($type == $ENUM
7549                 || scalar keys %{$unique_maps{$addr}} > 2
7550                 || defined $self->default_map)
7551             {
7552                 my $tables = $self->tables;
7553                 my $count = $self->count;
7554                 if ($verbosity && $count > 500 && $tables/$count > .1) {
7555                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7556                 }
7557                 $self->set_type($ENUM);
7558             }
7559             else {
7560                 $self->set_type($BINARY);
7561             }
7562         }
7563         undef $unique_maps{$addr};  # Garbage collect
7564         return;
7565     }
7566
7567     # Most of the accessors for a property actually apply to its map table.
7568     # Setup up accessor functions for those, referring to %map
7569     for my $sub (qw(
7570                     add_alias
7571                     add_anomalous_entry
7572                     add_comment
7573                     add_conflicting
7574                     add_description
7575                     add_duplicate
7576                     add_note
7577                     aliases
7578                     comment
7579                     complete_name
7580                     containing_range
7581                     core_access
7582                     count
7583                     default_map
7584                     delete_range
7585                     description
7586                     each_range
7587                     external_name
7588                     file_path
7589                     format
7590                     initialize
7591                     inverse_list
7592                     is_empty
7593                     name
7594                     note
7595                     perl_extension
7596                     property
7597                     range_count
7598                     ranges
7599                     range_size_1
7600                     reset_each_range
7601                     set_comment
7602                     set_core_access
7603                     set_default_map
7604                     set_file_path
7605                     set_final_comment
7606                     set_range_size_1
7607                     set_status
7608                     set_to_output_map
7609                     short_name
7610                     status
7611                     status_info
7612                     to_output_map
7613                     type_of
7614                     value_of
7615                     write
7616                 ))
7617                     # 'property' above is for symmetry, so that one can take
7618                     # the property of a property and get itself, and so don't
7619                     # have to distinguish between properties and tables in
7620                     # calling code
7621     {
7622         no strict "refs";
7623         *$sub = sub {
7624             use strict "refs";
7625             my $self = shift;
7626             no overloading;
7627             return $map{pack 'J', $self}->$sub(@_);
7628         }
7629     }
7630
7631
7632 } # End closure
7633
7634 package main;
7635
7636 sub join_lines($) {
7637     # Returns lines of the input joined together, so that they can be folded
7638     # properly.
7639     # This causes continuation lines to be joined together into one long line
7640     # for folding.  A continuation line is any line that doesn't begin with a
7641     # space or "\b" (the latter is stripped from the output).  This is so
7642     # lines can be be in a HERE document so as to fit nicely in the terminal
7643     # width, but be joined together in one long line, and then folded with
7644     # indents, '#' prefixes, etc, properly handled.
7645     # A blank separates the joined lines except if there is a break; an extra
7646     # blank is inserted after a period ending a line.
7647
7648     # Initialize the return with the first line.
7649     my ($return, @lines) = split "\n", shift;
7650
7651     # If the first line is null, it was an empty line, add the \n back in
7652     $return = "\n" if $return eq "";
7653
7654     # Now join the remainder of the physical lines.
7655     for my $line (@lines) {
7656
7657         # An empty line means wanted a blank line, so add two \n's to get that
7658         # effect, and go to the next line.
7659         if (length $line == 0) {
7660             $return .= "\n\n";
7661             next;
7662         }
7663
7664         # Look at the last character of what we have so far.
7665         my $previous_char = substr($return, -1, 1);
7666
7667         # And at the next char to be output.
7668         my $next_char = substr($line, 0, 1);
7669
7670         if ($previous_char ne "\n") {
7671
7672             # Here didn't end wth a nl.  If the next char a blank or \b, it
7673             # means that here there is a break anyway.  So add a nl to the
7674             # output.
7675             if ($next_char eq " " || $next_char eq "\b") {
7676                 $previous_char = "\n";
7677                 $return .= $previous_char;
7678             }
7679
7680             # Add an extra space after periods.
7681             $return .= " " if $previous_char eq '.';
7682         }
7683
7684         # Here $previous_char is still the latest character to be output.  If
7685         # it isn't a nl, it means that the next line is to be a continuation
7686         # line, with a blank inserted between them.
7687         $return .= " " if $previous_char ne "\n";
7688
7689         # Get rid of any \b
7690         substr($line, 0, 1) = "" if $next_char eq "\b";
7691
7692         # And append this next line.
7693         $return .= $line;
7694     }
7695
7696     return $return;
7697 }
7698
7699 sub simple_fold($;$$$) {
7700     # Returns a string of the input (string or an array of strings) folded
7701     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7702     # a \n
7703     # This is tailored for the kind of text written by this program,
7704     # especially the pod file, which can have very long names with
7705     # underscores in the middle, or words like AbcDefgHij....  We allow
7706     # breaking in the middle of such constructs if the line won't fit
7707     # otherwise.  The break in such cases will come either just after an
7708     # underscore, or just before one of the Capital letters.
7709
7710     local $to_trace = 0 if main::DEBUG;
7711
7712     my $line = shift;
7713     my $prefix = shift;     # Optional string to prepend to each output
7714                             # line
7715     $prefix = "" unless defined $prefix;
7716
7717     my $hanging_indent = shift; # Optional number of spaces to indent
7718                                 # continuation lines
7719     $hanging_indent = 0 unless $hanging_indent;
7720
7721     my $right_margin = shift;   # Optional number of spaces to narrow the
7722                                 # total width by.
7723     $right_margin = 0 unless defined $right_margin;
7724
7725     # Call carp with the 'nofold' option to avoid it from trying to call us
7726     # recursively
7727     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7728
7729     # The space available doesn't include what's automatically prepended
7730     # to each line, or what's reserved on the right.
7731     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7732     # XXX Instead of using the 'nofold' perhaps better to look up the stack
7733
7734     if (DEBUG && $hanging_indent >= $max) {
7735         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
7736         $hanging_indent = 0;
7737     }
7738
7739     # First, split into the current physical lines.
7740     my @line;
7741     if (ref $line) {        # Better be an array, because not bothering to
7742                             # test
7743         foreach my $line (@{$line}) {
7744             push @line, split /\n/, $line;
7745         }
7746     }
7747     else {
7748         @line = split /\n/, $line;
7749     }
7750
7751     #local $to_trace = 1 if main::DEBUG;
7752     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7753
7754     # Look at each current physical line.
7755     for (my $i = 0; $i < @line; $i++) {
7756         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7757         #local $to_trace = 1 if main::DEBUG;
7758         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7759
7760         # Remove prefix, because will be added back anyway, don't want
7761         # doubled prefix
7762         $line[$i] =~ s/^$prefix//;
7763
7764         # Remove trailing space
7765         $line[$i] =~ s/\s+\Z//;
7766
7767         # If the line is too long, fold it.
7768         if (length $line[$i] > $max) {
7769             my $remainder;
7770
7771             # Here needs to fold.  Save the leading space in the line for
7772             # later.
7773             $line[$i] =~ /^ ( \s* )/x;
7774             my $leading_space = $1;
7775             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7776
7777             # If character at final permissible position is white space,
7778             # fold there, which will delete that white space
7779             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7780                 $remainder = substr($line[$i], $max);
7781                 $line[$i] = substr($line[$i], 0, $max - 1);
7782             }
7783             else {
7784
7785                 # Otherwise fold at an acceptable break char closest to
7786                 # the max length.  Look at just the maximal initial
7787                 # segment of the line
7788                 my $segment = substr($line[$i], 0, $max - 1);
7789                 if ($segment =~
7790                     /^ ( .{$hanging_indent}   # Don't look before the
7791                                               #  indent.
7792                         \ *                   # Don't look in leading
7793                                               #  blanks past the indent
7794                             [^ ] .*           # Find the right-most
7795                         (?:                   #  acceptable break:
7796                             [ \s = ]          # space or equal
7797                             | - (?! [.0-9] )  # or non-unary minus.
7798                         )                     # $1 includes the character
7799                     )/x)
7800                 {
7801                     # Split into the initial part that fits, and remaining
7802                     # part of the input
7803                     $remainder = substr($line[$i], length $1);
7804                     $line[$i] = $1;
7805                     trace $line[$i] if DEBUG && $to_trace;
7806                     trace $remainder if DEBUG && $to_trace;
7807                 }
7808
7809                 # If didn't find a good breaking spot, see if there is a
7810                 # not-so-good breaking spot.  These are just after
7811                 # underscores or where the case changes from lower to
7812                 # upper.  Use \a as a soft hyphen, but give up
7813                 # and don't break the line if there is actually a \a
7814                 # already in the input.  We use an ascii character for the
7815                 # soft-hyphen to avoid any attempt by miniperl to try to
7816                 # access the files that this program is creating.
7817                 elsif ($segment !~ /\a/
7818                        && ($segment =~ s/_/_\a/g
7819                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7820                 {
7821                     # Here were able to find at least one place to insert
7822                     # our substitute soft hyphen.  Find the right-most one
7823                     # and replace it by a real hyphen.
7824                     trace $segment if DEBUG && $to_trace;
7825                     substr($segment,
7826                             rindex($segment, "\a"),
7827                             1) = '-';
7828
7829                     # Then remove the soft hyphen substitutes.
7830                     $segment =~ s/\a//g;
7831                     trace $segment if DEBUG && $to_trace;
7832
7833                     # And split into the initial part that fits, and
7834                     # remainder of the line
7835                     my $pos = rindex($segment, '-');
7836                     $remainder = substr($line[$i], $pos);
7837                     trace $remainder if DEBUG && $to_trace;
7838                     $line[$i] = substr($segment, 0, $pos + 1);
7839                 }
7840             }
7841
7842             # Here we know if we can fold or not.  If we can, $remainder
7843             # is what remains to be processed in the next iteration.
7844             if (defined $remainder) {
7845                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7846
7847                 # Insert the folded remainder of the line as a new element
7848                 # of the array.  (It may still be too long, but we will
7849                 # deal with that next time through the loop.)  Omit any
7850                 # leading space in the remainder.
7851                 $remainder =~ s/^\s+//;
7852                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7853
7854                 # But then indent by whichever is larger of:
7855                 # 1) the leading space on the input line;
7856                 # 2) the hanging indent.
7857                 # This preserves indentation in the original line.
7858                 my $lead = ($leading_space)
7859                             ? length $leading_space
7860                             : $hanging_indent;
7861                 $lead = max($lead, $hanging_indent);
7862                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7863             }
7864         }
7865
7866         # Ready to output the line. Get rid of any trailing space
7867         # And prefix by the required $prefix passed in.
7868         $line[$i] =~ s/\s+$//;
7869         $line[$i] = "$prefix$line[$i]\n";
7870     } # End of looping through all the lines.
7871
7872     return join "", @line;
7873 }
7874
7875 sub property_ref {  # Returns a reference to a property object.
7876     return Property::property_ref(@_);
7877 }
7878
7879 sub force_unlink ($) {
7880     my $filename = shift;
7881     return unless file_exists($filename);
7882     return if CORE::unlink($filename);
7883
7884     # We might need write permission
7885     chmod 0777, $filename;
7886     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
7887     return;
7888 }
7889
7890 sub write ($$@) {
7891     # Given a filename and references to arrays of lines, write the lines of
7892     # each array to the file
7893     # Filename can be given as an arrayref of directory names
7894
7895     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
7896
7897     my $file  = shift;
7898     my $use_utf8 = shift;
7899
7900     # Get into a single string if an array, and get rid of, in Unix terms, any
7901     # leading '.'
7902     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7903     $file = File::Spec->canonpath($file);
7904
7905     # If has directories, make sure that they all exist
7906     (undef, my $directories, undef) = File::Spec->splitpath($file);
7907     File::Path::mkpath($directories) if $directories && ! -d $directories;
7908
7909     push @files_actually_output, $file;
7910
7911     force_unlink ($file);
7912
7913     my $OUT;
7914     if (not open $OUT, ">", $file) {
7915         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
7916         return;
7917     }
7918
7919     binmode $OUT, ":utf8" if $use_utf8;
7920
7921     while (defined (my $lines_ref = shift)) {
7922         unless (@$lines_ref) {
7923             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
7924         }
7925
7926         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
7927     }
7928     close $OUT or die Carp::my_carp("close '$file' failed: $!");
7929
7930     print "$file written.\n" if $verbosity >= $VERBOSE;
7931
7932     return;
7933 }
7934
7935
7936 sub Standardize($) {
7937     # This converts the input name string into a standardized equivalent to
7938     # use internally.
7939
7940     my $name = shift;
7941     unless (defined $name) {
7942       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
7943       return;
7944     }
7945
7946     # Remove any leading or trailing white space
7947     $name =~ s/^\s+//g;
7948     $name =~ s/\s+$//g;
7949
7950     # Convert interior white space and hyphens into underscores.
7951     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7952
7953     # Capitalize the letter following an underscore, and convert a sequence of
7954     # multiple underscores to a single one
7955     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7956
7957     # And capitalize the first letter, but not for the special cjk ones.
7958     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7959     return $name;
7960 }
7961
7962 sub standardize ($) {
7963     # Returns a lower-cased standardized name, without underscores.  This form
7964     # is chosen so that it can distinguish between any real versus superficial
7965     # Unicode name differences.  It relies on the fact that Unicode doesn't
7966     # have interior underscores, white space, nor dashes in any
7967     # stricter-matched name.  It should not be used on Unicode code point
7968     # names (the Name property), as they mostly, but not always follow these
7969     # rules.
7970
7971     my $name = Standardize(shift);
7972     return if !defined $name;
7973
7974     $name =~ s/ (?<= .) _ (?= . ) //xg;
7975     return lc $name;
7976 }
7977
7978 {   # Closure
7979
7980     my $indent_increment = " " x 2;
7981     my %already_output;
7982
7983     $main::simple_dumper_nesting = 0;
7984
7985     sub simple_dumper {
7986         # Like Simple Data::Dumper. Good enough for our needs. We can't use
7987         # the real thing as we have to run under miniperl.
7988
7989         # It is designed so that on input it is at the beginning of a line,
7990         # and the final thing output in any call is a trailing ",\n".
7991
7992         my $item = shift;
7993         my $indent = shift;
7994         $indent = "" if ! defined $indent;
7995
7996         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7997
7998         # nesting level is localized, so that as the call stack pops, it goes
7999         # back to the prior value.
8000         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8001         undef %already_output if $main::simple_dumper_nesting == 0;
8002         $main::simple_dumper_nesting++;
8003         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8004
8005         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8006
8007         # Determine the indent for recursive calls.
8008         my $next_indent = $indent . $indent_increment;
8009
8010         my $output;
8011         if (! ref $item) {
8012
8013             # Dump of scalar: just output it in quotes if not a number.  To do
8014             # so we must escape certain characters, and therefore need to
8015             # operate on a copy to avoid changing the original
8016             my $copy = $item;
8017             $copy = $UNDEF unless defined $copy;
8018
8019             # Quote non-numbers (numbers also have optional leading '-' and
8020             # fractions)
8021             if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
8022
8023                 # Escape apostrophe and backslash
8024                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8025                 $copy = "'$copy'";
8026             }
8027             $output = "$indent$copy,\n";
8028         }
8029         else {
8030
8031             # Keep track of cycles in the input, and refuse to infinitely loop
8032             my $addr = do { no overloading; pack 'J', $item; };
8033             if (defined $already_output{$addr}) {
8034                 return "${indent}ALREADY OUTPUT: $item\n";
8035             }
8036             $already_output{$addr} = $item;
8037
8038             if (ref $item eq 'ARRAY') {
8039                 my $using_brackets;
8040                 $output = $indent;
8041                 if ($main::simple_dumper_nesting > 1) {
8042                     $output .= '[';
8043                     $using_brackets = 1;
8044                 }
8045                 else {
8046                     $using_brackets = 0;
8047                 }
8048
8049                 # If the array is empty, put the closing bracket on the same
8050                 # line.  Otherwise, recursively add each array element
8051                 if (@$item == 0) {
8052                     $output .= " ";
8053                 }
8054                 else {
8055                     $output .= "\n";
8056                     for (my $i = 0; $i < @$item; $i++) {
8057
8058                         # Indent array elements one level
8059                         $output .= &simple_dumper($item->[$i], $next_indent);
8060                         $output =~ s/\n$//;      # Remove trailing nl so as to
8061                         $output .= " # [$i]\n";  # add a comment giving the
8062                                                  # array index
8063                     }
8064                     $output .= $indent;     # Indent closing ']' to orig level
8065                 }
8066                 $output .= ']' if $using_brackets;
8067                 $output .= ",\n";
8068             }
8069             elsif (ref $item eq 'HASH') {
8070                 my $is_first_line;
8071                 my $using_braces;
8072                 my $body_indent;
8073
8074                 # No surrounding braces at top level
8075                 $output .= $indent;
8076                 if ($main::simple_dumper_nesting > 1) {
8077                     $output .= "{\n";
8078                     $is_first_line = 0;
8079                     $body_indent = $next_indent;
8080                     $next_indent .= $indent_increment;
8081                     $using_braces = 1;
8082                 }
8083                 else {
8084                     $is_first_line = 1;
8085                     $body_indent = $indent;
8086                     $using_braces = 0;
8087                 }
8088
8089                 # Output hashes sorted alphabetically instead of apparently
8090                 # random.  Use caseless alphabetic sort
8091                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8092                 {
8093                     if ($is_first_line) {
8094                         $is_first_line = 0;
8095                     }
8096                     else {
8097                         $output .= "$body_indent";
8098                     }
8099
8100                     # The key must be a scalar, but this recursive call quotes
8101                     # it
8102                     $output .= &simple_dumper($key);
8103
8104                     # And change the trailing comma and nl to the hash fat
8105                     # comma for clarity, and so the value can be on the same
8106                     # line
8107                     $output =~ s/,\n$/ => /;
8108
8109                     # Recursively call to get the value's dump.
8110                     my $next = &simple_dumper($item->{$key}, $next_indent);
8111
8112                     # If the value is all on one line, remove its indent, so
8113                     # will follow the => immediately.  If it takes more than
8114                     # one line, start it on a new line.
8115                     if ($next !~ /\n.*\n/) {
8116                         $next =~ s/^ *//;
8117                     }
8118                     else {
8119                         $output .= "\n";
8120                     }
8121                     $output .= $next;
8122                 }
8123
8124                 $output .= "$indent},\n" if $using_braces;
8125             }
8126             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8127                 $output = $indent . ref($item) . "\n";
8128                 # XXX see if blessed
8129             }
8130             elsif ($item->can('dump')) {
8131
8132                 # By convention in this program, objects furnish a 'dump'
8133                 # method.  Since not doing any output at this level, just pass
8134                 # on the input indent
8135                 $output = $item->dump($indent);
8136             }
8137             else {
8138                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
8139             }
8140         }
8141         return $output;
8142     }
8143 }
8144
8145 sub dump_inside_out {
8146     # Dump inside-out hashes in an object's state by converting them to a
8147     # regular hash and then calling simple_dumper on that.
8148
8149     my $object = shift;
8150     my $fields_ref = shift;
8151     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8152
8153     my $addr = do { no overloading; pack 'J', $object; };
8154
8155     my %hash;
8156     foreach my $key (keys %$fields_ref) {
8157         $hash{$key} = $fields_ref->{$key}{$addr};
8158     }
8159
8160     return simple_dumper(\%hash, @_);
8161 }
8162
8163 sub _operator_dot {
8164     # Overloaded '.' method that is common to all packages.  It uses the
8165     # package's stringify method.
8166
8167     my $self = shift;
8168     my $other = shift;
8169     my $reversed = shift;
8170     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8171
8172     $other = "" unless defined $other;
8173
8174     foreach my $which (\$self, \$other) {
8175         next unless ref $$which;
8176         if ($$which->can('_operator_stringify')) {
8177             $$which = $$which->_operator_stringify;
8178         }
8179         else {
8180             my $ref = ref $$which;
8181             my $addr = do { no overloading; pack 'J', $$which; };
8182             $$which = "$ref ($addr)";
8183         }
8184     }
8185     return ($reversed)
8186             ? "$other$self"
8187             : "$self$other";
8188 }
8189
8190 sub _operator_equal {
8191     # Generic overloaded '==' routine.  To be equal, they must be the exact
8192     # same object
8193
8194     my $self = shift;
8195     my $other = shift;
8196
8197     return 0 unless defined $other;
8198     return 0 unless ref $other;
8199     no overloading;
8200     return $self == $other;
8201 }
8202
8203 sub _operator_not_equal {
8204     my $self = shift;
8205     my $other = shift;
8206
8207     return ! _operator_equal($self, $other);
8208 }
8209
8210 sub process_PropertyAliases($) {
8211     # This reads in the PropertyAliases.txt file, which contains almost all
8212     # the character properties in Unicode and their equivalent aliases:
8213     # scf       ; Simple_Case_Folding         ; sfc
8214     #
8215     # Field 0 is the preferred short name for the property.
8216     # Field 1 is the full name.
8217     # Any succeeding ones are other accepted names.
8218
8219     my $file= shift;
8220     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8221
8222     # This whole file was non-existent in early releases, so use our own
8223     # internal one.
8224     $file->insert_lines(get_old_property_aliases())
8225                                                 if ! -e 'PropertyAliases.txt';
8226
8227     # Add any cjk properties that may have been defined.
8228     $file->insert_lines(@cjk_properties);
8229
8230     while ($file->next_line) {
8231
8232         my @data = split /\s*;\s*/;
8233
8234         my $full = $data[1];
8235
8236         my $this = Property->new($data[0], Full_Name => $full);
8237
8238         # Start looking for more aliases after these two.
8239         for my $i (2 .. @data - 1) {
8240             $this->add_alias($data[$i]);
8241         }
8242
8243     }
8244     return;
8245 }
8246
8247 sub finish_property_setup {
8248     # Finishes setting up after PropertyAliases.
8249
8250     my $file = shift;
8251     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8252
8253     # This entry was missing from this file in earlier Unicode versions
8254     if (-e 'Jamo.txt') {
8255         my $jsn = property_ref('JSN');
8256         if (! defined $jsn) {
8257             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8258         }
8259     }
8260
8261     # This entry is still missing as of 6.0, perhaps because no short name for
8262     # it.
8263     if (-e 'NameAliases.txt') {
8264         my $aliases = property_ref('Name_Alias');
8265         if (! defined $aliases) {
8266             $aliases = Property->new('Name_Alias');
8267         }
8268     }
8269
8270     # These are used so much, that we set globals for them.
8271     $gc = property_ref('General_Category');
8272     $block = property_ref('Block');
8273
8274     # Perl adds this alias.
8275     $gc->add_alias('Category');
8276
8277     # For backwards compatibility, these property files have particular names.
8278     my $upper = property_ref('Uppercase_Mapping');
8279     $upper->set_core_access('uc()');
8280     $upper->set_file('Upper'); # This is what utf8.c calls it
8281
8282     my $lower = property_ref('Lowercase_Mapping');
8283     $lower->set_core_access('lc()');
8284     $lower->set_file('Lower');
8285
8286     my $title = property_ref('Titlecase_Mapping');
8287     $title->set_core_access('ucfirst()');
8288     $title->set_file('Title');
8289
8290     my $fold = property_ref('Case_Folding');
8291     $fold->set_file('Fold') if defined $fold;
8292
8293     # utf8.c has a different meaning for non range-size-1 for map properties
8294     # that this program doesn't currently handle; and even if it were changed
8295     # to do so, some other code may be using them expecting range size 1.
8296     foreach my $property (qw {
8297                                 Case_Folding
8298                                 Lowercase_Mapping
8299                                 Titlecase_Mapping
8300                                 Uppercase_Mapping
8301                             })
8302     {
8303         property_ref($property)->set_range_size_1(1);
8304     }
8305
8306     # These two properties aren't actually used in the core, but unfortunately
8307     # the names just above that are in the core interfere with these, so
8308     # choose different names.  These aren't a problem unless the map tables
8309     # for these files get written out.
8310     my $lowercase = property_ref('Lowercase');
8311     $lowercase->set_file('IsLower') if defined $lowercase;
8312     my $uppercase = property_ref('Uppercase');
8313     $uppercase->set_file('IsUpper') if defined $uppercase;
8314
8315     # Set up the hard-coded default mappings, but only on properties defined
8316     # for this release
8317     foreach my $property (keys %default_mapping) {
8318         my $property_object = property_ref($property);
8319         next if ! defined $property_object;
8320         my $default_map = $default_mapping{$property};
8321         $property_object->set_default_map($default_map);
8322
8323         # A map of <code point> implies the property is string.
8324         if ($property_object->type == $UNKNOWN
8325             && $default_map eq $CODE_POINT)
8326         {
8327             $property_object->set_type($STRING);
8328         }
8329     }
8330
8331     # The following use the Multi_Default class to create objects for
8332     # defaults.
8333
8334     # Bidi class has a complicated default, but the derived file takes care of
8335     # the complications, leaving just 'L'.
8336     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8337         property_ref('Bidi_Class')->set_default_map('L');
8338     }
8339     else {
8340         my $default;
8341
8342         # The derived file was introduced in 3.1.1.  The values below are
8343         # taken from table 3-8, TUS 3.0
8344         my $default_R =
8345             'my $default = Range_List->new;
8346              $default->add_range(0x0590, 0x05FF);
8347              $default->add_range(0xFB1D, 0xFB4F);'
8348         ;
8349
8350         # The defaults apply only to unassigned characters
8351         $default_R .= '$gc->table("Unassigned") & $default;';
8352
8353         if ($v_version lt v3.0.0) {
8354             $default = Multi_Default->new(R => $default_R, 'L');
8355         }
8356         else {
8357
8358             # AL apparently not introduced until 3.0:  TUS 2.x references are
8359             # not on-line to check it out
8360             my $default_AL =
8361                 'my $default = Range_List->new;
8362                  $default->add_range(0x0600, 0x07BF);
8363                  $default->add_range(0xFB50, 0xFDFF);
8364                  $default->add_range(0xFE70, 0xFEFF);'
8365             ;
8366
8367             # Non-character code points introduced in this release; aren't AL
8368             if ($v_version ge 3.1.0) {
8369                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8370             }
8371             $default_AL .= '$gc->table("Unassigned") & $default';
8372             $default = Multi_Default->new(AL => $default_AL,
8373                                           R => $default_R,
8374                                           'L');
8375         }
8376         property_ref('Bidi_Class')->set_default_map($default);
8377     }
8378
8379     # Joining type has a complicated default, but the derived file takes care
8380     # of the complications, leaving just 'U' (or Non_Joining), except the file
8381     # is bad in 3.1.0
8382     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8383         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8384             property_ref('Joining_Type')->set_default_map('Non_Joining');
8385         }
8386         else {
8387
8388             # Otherwise, there are not one, but two possibilities for the
8389             # missing defaults: T and U.
8390             # The missing defaults that evaluate to T are given by:
8391             # T = Mn + Cf - ZWNJ - ZWJ
8392             # where Mn and Cf are the general category values. In other words,
8393             # any non-spacing mark or any format control character, except
8394             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8395             # WIDTH JOINER (joining type C).
8396             my $default = Multi_Default->new(
8397                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8398                'Non_Joining');
8399             property_ref('Joining_Type')->set_default_map($default);
8400         }
8401     }
8402
8403     # Line break has a complicated default in early releases. It is 'Unknown'
8404     # for non-assigned code points; 'AL' for assigned.
8405     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8406         my $lb = property_ref('Line_Break');
8407         if ($v_version gt 3.2.0) {
8408             $lb->set_default_map('Unknown');
8409         }
8410         else {
8411             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8412                                               'AL');
8413             $lb->set_default_map($default);
8414         }
8415
8416         # If has the URS property, make sure that the standard aliases are in
8417         # it, since not in the input tables in some versions.
8418         my $urs = property_ref('Unicode_Radical_Stroke');
8419         if (defined $urs) {
8420             $urs->add_alias('cjkRSUnicode');
8421             $urs->add_alias('kRSUnicode');
8422         }
8423     }
8424     return;
8425 }
8426
8427 sub get_old_property_aliases() {
8428     # Returns what would be in PropertyAliases.txt if it existed in very old
8429     # versions of Unicode.  It was derived from the one in 3.2, and pared
8430     # down based on the data that was actually in the older releases.
8431     # An attempt was made to use the existence of files to mean inclusion or
8432     # not of various aliases, but if this was not sufficient, using version
8433     # numbers was resorted to.
8434
8435     my @return;
8436
8437     # These are to be used in all versions (though some are constructed by
8438     # this program if missing)
8439     push @return, split /\n/, <<'END';
8440 bc        ; Bidi_Class
8441 Bidi_M    ; Bidi_Mirrored
8442 cf        ; Case_Folding
8443 ccc       ; Canonical_Combining_Class
8444 dm        ; Decomposition_Mapping
8445 dt        ; Decomposition_Type
8446 gc        ; General_Category
8447 isc       ; ISO_Comment
8448 lc        ; Lowercase_Mapping
8449 na        ; Name
8450 na1       ; Unicode_1_Name
8451 nt        ; Numeric_Type
8452 nv        ; Numeric_Value
8453 sfc       ; Simple_Case_Folding
8454 slc       ; Simple_Lowercase_Mapping
8455 stc       ; Simple_Titlecase_Mapping
8456 suc       ; Simple_Uppercase_Mapping
8457 tc        ; Titlecase_Mapping
8458 uc        ; Uppercase_Mapping
8459 END
8460
8461     if (-e 'Blocks.txt') {
8462         push @return, "blk       ; Block\n";
8463     }
8464     if (-e 'ArabicShaping.txt') {
8465         push @return, split /\n/, <<'END';
8466 jg        ; Joining_Group
8467 jt        ; Joining_Type
8468 END
8469     }
8470     if (-e 'PropList.txt') {
8471
8472         # This first set is in the original old-style proplist.
8473         push @return, split /\n/, <<'END';
8474 Alpha     ; Alphabetic
8475 Bidi_C    ; Bidi_Control
8476 Dash      ; Dash
8477 Dia       ; Diacritic
8478 Ext       ; Extender
8479 Hex       ; Hex_Digit
8480 Hyphen    ; Hyphen
8481 IDC       ; ID_Continue
8482 Ideo      ; Ideographic
8483 Join_C    ; Join_Control
8484 Math      ; Math
8485 QMark     ; Quotation_Mark
8486 Term      ; Terminal_Punctuation
8487 WSpace    ; White_Space
8488 END
8489         # The next sets were added later
8490         if ($v_version ge v3.0.0) {
8491             push @return, split /\n/, <<'END';
8492 Upper     ; Uppercase
8493 Lower     ; Lowercase
8494 END
8495         }
8496         if ($v_version ge v3.0.1) {
8497             push @return, split /\n/, <<'END';
8498 NChar     ; Noncharacter_Code_Point
8499 END
8500         }
8501         # The next sets were added in the new-style
8502         if ($v_version ge v3.1.0) {
8503             push @return, split /\n/, <<'END';
8504 OAlpha    ; Other_Alphabetic
8505 OLower    ; Other_Lowercase
8506 OMath     ; Other_Math
8507 OUpper    ; Other_Uppercase
8508 END
8509         }
8510         if ($v_version ge v3.1.1) {
8511             push @return, "AHex      ; ASCII_Hex_Digit\n";
8512         }
8513     }
8514     if (-e 'EastAsianWidth.txt') {
8515         push @return, "ea        ; East_Asian_Width\n";
8516     }
8517     if (-e 'CompositionExclusions.txt') {
8518         push @return, "CE        ; Composition_Exclusion\n";
8519     }
8520     if (-e 'LineBreak.txt') {
8521         push @return, "lb        ; Line_Break\n";
8522     }
8523     if (-e 'BidiMirroring.txt') {
8524         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
8525     }
8526     if (-e 'Scripts.txt') {
8527         push @return, "sc        ; Script\n";
8528     }
8529     if (-e 'DNormalizationProps.txt') {
8530         push @return, split /\n/, <<'END';
8531 Comp_Ex   ; Full_Composition_Exclusion
8532 FC_NFKC   ; FC_NFKC_Closure
8533 NFC_QC    ; NFC_Quick_Check
8534 NFD_QC    ; NFD_Quick_Check
8535 NFKC_QC   ; NFKC_Quick_Check
8536 NFKD_QC   ; NFKD_Quick_Check
8537 XO_NFC    ; Expands_On_NFC
8538 XO_NFD    ; Expands_On_NFD
8539 XO_NFKC   ; Expands_On_NFKC
8540 XO_NFKD   ; Expands_On_NFKD
8541 END
8542     }
8543     if (-e 'DCoreProperties.txt') {
8544         push @return, split /\n/, <<'END';
8545 IDS       ; ID_Start
8546 XIDC      ; XID_Continue
8547 XIDS      ; XID_Start
8548 END
8549         # These can also appear in some versions of PropList.txt
8550         push @return, "Lower     ; Lowercase\n"
8551                                     unless grep { $_ =~ /^Lower\b/} @return;
8552         push @return, "Upper     ; Uppercase\n"
8553                                     unless grep { $_ =~ /^Upper\b/} @return;
8554     }
8555
8556     # This flag requires the DAge.txt file to be copied into the directory.
8557     if (DEBUG && $compare_versions) {
8558         push @return, 'age       ; Age';
8559     }
8560
8561     return @return;
8562 }
8563
8564 sub process_PropValueAliases {
8565     # This file contains values that properties look like:
8566     # bc ; AL        ; Arabic_Letter
8567     # blk; n/a       ; Greek_And_Coptic                 ; Greek
8568     #
8569     # Field 0 is the property.
8570     # Field 1 is the short name of a property value or 'n/a' if no
8571     #                short name exists;
8572     # Field 2 is the full property value name;
8573     # Any other fields are more synonyms for the property value.
8574     # Purely numeric property values are omitted from the file; as are some
8575     # others, fewer and fewer in later releases
8576
8577     # Entries for the ccc property have an extra field before the
8578     # abbreviation:
8579     # ccc;   0; NR   ; Not_Reordered
8580     # It is the numeric value that the names are synonyms for.
8581
8582     # There are comment entries for values missing from this file:
8583     # # @missing: 0000..10FFFF; ISO_Comment; <none>
8584     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8585
8586     my $file= shift;
8587     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8588
8589     # This whole file was non-existent in early releases, so use our own
8590     # internal one if necessary.
8591     if (! -e 'PropValueAliases.txt') {
8592         $file->insert_lines(get_old_property_value_aliases());
8593     }
8594
8595     # Add any explicit cjk values
8596     $file->insert_lines(@cjk_property_values);
8597
8598     # This line is used only for testing the code that checks for name
8599     # conflicts.  There is a script Inherited, and when this line is executed
8600     # it causes there to be a name conflict with the 'Inherited' that this
8601     # program generates for this block property value
8602     #$file->insert_lines('blk; n/a; Herited');
8603
8604
8605     # Process each line of the file ...
8606     while ($file->next_line) {
8607
8608         my ($property, @data) = split /\s*;\s*/;
8609
8610         # The full name for the ccc property value is in field 2 of the
8611         # remaining ones; field 1 for all other properties.  Swap ccc fields 1
8612         # and 2.  (Rightmost splice removes field 2, returning it; left splice
8613         # inserts that into field 1, thus shifting former field 1 to field 2.)
8614         splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8615
8616         # If there is no short name, use the full one in element 1
8617         $data[0] = $data[1] if $data[0] eq "n/a";
8618
8619         # Earlier releases had the pseudo property 'qc' that should expand to
8620         # the ones that replace it below.
8621         if ($property eq 'qc') {
8622             if (lc $data[0] eq 'y') {
8623                 $file->insert_lines('NFC_QC; Y      ; Yes',
8624                                     'NFD_QC; Y      ; Yes',
8625                                     'NFKC_QC; Y     ; Yes',
8626                                     'NFKD_QC; Y     ; Yes',
8627                                     );
8628             }
8629             elsif (lc $data[0] eq 'n') {
8630                 $file->insert_lines('NFC_QC; N      ; No',
8631                                     'NFD_QC; N      ; No',
8632                                     'NFKC_QC; N     ; No',
8633                                     'NFKD_QC; N     ; No',
8634                                     );
8635             }
8636             elsif (lc $data[0] eq 'm') {
8637                 $file->insert_lines('NFC_QC; M      ; Maybe',
8638                                     'NFKC_QC; M     ; Maybe',
8639                                     );
8640             }
8641             else {
8642                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8643             }
8644             next;
8645         }
8646
8647         # The first field is the short name, 2nd is the full one.
8648         my $property_object = property_ref($property);
8649         my $table = $property_object->add_match_table($data[0],
8650                                                 Full_Name => $data[1]);
8651
8652         # Start looking for more aliases after these two.
8653         for my $i (2 .. @data - 1) {
8654             $table->add_alias($data[$i]);
8655         }
8656     } # End of looping through the file
8657
8658     # As noted in the comments early in the program, it generates tables for
8659     # the default values for all releases, even those for which the concept
8660     # didn't exist at the time.  Here we add those if missing.
8661     my $age = property_ref('age');
8662     if (defined $age && ! defined $age->table('Unassigned')) {
8663         $age->add_match_table('Unassigned');
8664     }
8665     $block->add_match_table('No_Block') if -e 'Blocks.txt'
8666                                     && ! defined $block->table('No_Block');
8667
8668
8669     # Now set the default mappings of the properties from the file.  This is
8670     # done after the loop because a number of properties have only @missings
8671     # entries in the file, and may not show up until the end.
8672     my @defaults = $file->get_missings;
8673     foreach my $default_ref (@defaults) {
8674         my $default = $default_ref->[0];
8675         my $property = property_ref($default_ref->[1]);
8676         $property->set_default_map($default);
8677     }
8678     return;
8679 }
8680
8681 sub get_old_property_value_aliases () {
8682     # Returns what would be in PropValueAliases.txt if it existed in very old
8683     # versions of Unicode.  It was derived from the one in 3.2, and pared
8684     # down.  An attempt was made to use the existence of files to mean
8685     # inclusion or not of various aliases, but if this was not sufficient,
8686     # using version numbers was resorted to.
8687
8688     my @return = split /\n/, <<'END';
8689 bc ; AN        ; Arabic_Number
8690 bc ; B         ; Paragraph_Separator
8691 bc ; CS        ; Common_Separator
8692 bc ; EN        ; European_Number
8693 bc ; ES        ; European_Separator
8694 bc ; ET        ; European_Terminator
8695 bc ; L         ; Left_To_Right
8696 bc ; ON        ; Other_Neutral
8697 bc ; R         ; Right_To_Left
8698 bc ; WS        ; White_Space
8699
8700 # The standard combining classes are very much different in v1, so only use
8701 # ones that look right (not checked thoroughly)
8702 ccc;   0; NR   ; Not_Reordered
8703 ccc;   1; OV   ; Overlay
8704 ccc;   7; NK   ; Nukta
8705 ccc;   8; KV   ; Kana_Voicing
8706 ccc;   9; VR   ; Virama
8707 ccc; 202; ATBL ; Attached_Below_Left
8708 ccc; 216; ATAR ; Attached_Above_Right
8709 ccc; 218; BL   ; Below_Left
8710 ccc; 220; B    ; Below
8711 ccc; 222; BR   ; Below_Right
8712 ccc; 224; L    ; Left
8713 ccc; 228; AL   ; Above_Left
8714 ccc; 230; A    ; Above
8715 ccc; 232; AR   ; Above_Right
8716 ccc; 234; DA   ; Double_Above
8717
8718 dt ; can       ; canonical
8719 dt ; enc       ; circle
8720 dt ; fin       ; final
8721 dt ; font      ; font
8722 dt ; fra       ; fraction
8723 dt ; init      ; initial
8724 dt ; iso       ; isolated
8725 dt ; med       ; medial
8726 dt ; n/a       ; none
8727 dt ; nb        ; noBreak
8728 dt ; sqr       ; square
8729 dt ; sub       ; sub
8730 dt ; sup       ; super
8731
8732 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
8733 gc ; Cc        ; Control
8734 gc ; Cn        ; Unassigned
8735 gc ; Co        ; Private_Use
8736 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
8737 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
8738 gc ; Ll        ; Lowercase_Letter
8739 gc ; Lm        ; Modifier_Letter
8740 gc ; Lo        ; Other_Letter
8741 gc ; Lu        ; Uppercase_Letter
8742 gc ; M         ; Mark                             # Mc | Me | Mn
8743 gc ; Mc        ; Spacing_Mark
8744 gc ; Mn        ; Nonspacing_Mark
8745 gc ; N         ; Number                           # Nd | Nl | No
8746 gc ; Nd        ; Decimal_Number
8747 gc ; No        ; Other_Number
8748 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
8749 gc ; Pd        ; Dash_Punctuation
8750 gc ; Pe        ; Close_Punctuation
8751 gc ; Po        ; Other_Punctuation
8752 gc ; Ps        ; Open_Punctuation
8753 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
8754 gc ; Sc        ; Currency_Symbol
8755 gc ; Sm        ; Math_Symbol
8756 gc ; So        ; Other_Symbol
8757 gc ; Z         ; Separator                        # Zl | Zp | Zs
8758 gc ; Zl        ; Line_Separator
8759 gc ; Zp        ; Paragraph_Separator
8760 gc ; Zs        ; Space_Separator
8761
8762 nt ; de        ; Decimal
8763 nt ; di        ; Digit
8764 nt ; n/a       ; None
8765 nt ; nu        ; Numeric
8766 END
8767
8768     if (-e 'ArabicShaping.txt') {
8769         push @return, split /\n/, <<'END';
8770 jg ; n/a       ; AIN
8771 jg ; n/a       ; ALEF
8772 jg ; n/a       ; DAL
8773 jg ; n/a       ; GAF
8774 jg ; n/a       ; LAM
8775 jg ; n/a       ; MEEM
8776 jg ; n/a       ; NO_JOINING_GROUP
8777 jg ; n/a       ; NOON
8778 jg ; n/a       ; QAF
8779 jg ; n/a       ; SAD
8780 jg ; n/a       ; SEEN
8781 jg ; n/a       ; TAH
8782 jg ; n/a       ; WAW
8783
8784 jt ; C         ; Join_Causing
8785 jt ; D         ; Dual_Joining
8786 jt ; L         ; Left_Joining
8787 jt ; R         ; Right_Joining
8788 jt ; U         ; Non_Joining
8789 jt ; T         ; Transparent
8790 END
8791         if ($v_version ge v3.0.0) {
8792             push @return, split /\n/, <<'END';
8793 jg ; n/a       ; ALAPH
8794 jg ; n/a       ; BEH
8795 jg ; n/a       ; BETH
8796 jg ; n/a       ; DALATH_RISH
8797 jg ; n/a       ; E
8798 jg ; n/a       ; FEH
8799 jg ; n/a       ; FINAL_SEMKATH
8800 jg ; n/a       ; GAMAL
8801 jg ; n/a       ; HAH
8802 jg ; n/a       ; HAMZA_ON_HEH_GOAL
8803 jg ; n/a       ; HE
8804 jg ; n/a       ; HEH
8805 jg ; n/a       ; HEH_GOAL
8806 jg ; n/a       ; HETH
8807 jg ; n/a       ; KAF
8808 jg ; n/a       ; KAPH
8809 jg ; n/a       ; KNOTTED_HEH
8810 jg ; n/a       ; LAMADH
8811 jg ; n/a       ; MIM
8812 jg ; n/a       ; NUN
8813 jg ; n/a       ; PE
8814 jg ; n/a       ; QAPH
8815 jg ; n/a       ; REH
8816 jg ; n/a       ; REVERSED_PE
8817 jg ; n/a       ; SADHE
8818 jg ; n/a       ; SEMKATH
8819 jg ; n/a       ; SHIN
8820 jg ; n/a       ; SWASH_KAF
8821 jg ; n/a       ; TAW
8822 jg ; n/a       ; TEH_MARBUTA
8823 jg ; n/a       ; TETH
8824 jg ; n/a       ; YEH
8825 jg ; n/a       ; YEH_BARREE
8826 jg ; n/a       ; YEH_WITH_TAIL
8827 jg ; n/a       ; YUDH
8828 jg ; n/a       ; YUDH_HE
8829 jg ; n/a       ; ZAIN
8830 END
8831         }
8832     }
8833
8834
8835     if (-e 'EastAsianWidth.txt') {
8836         push @return, split /\n/, <<'END';
8837 ea ; A         ; Ambiguous
8838 ea ; F         ; Fullwidth
8839 ea ; H         ; Halfwidth
8840 ea ; N         ; Neutral
8841 ea ; Na        ; Narrow
8842 ea ; W         ; Wide
8843 END
8844     }
8845
8846     if (-e 'LineBreak.txt') {
8847         push @return, split /\n/, <<'END';
8848 lb ; AI        ; Ambiguous
8849 lb ; AL        ; Alphabetic
8850 lb ; B2        ; Break_Both
8851 lb ; BA        ; Break_After
8852 lb ; BB        ; Break_Before
8853 lb ; BK        ; Mandatory_Break
8854 lb ; CB        ; Contingent_Break
8855 lb ; CL        ; Close_Punctuation
8856 lb ; CM        ; Combining_Mark
8857 lb ; CR        ; Carriage_Return
8858 lb ; EX        ; Exclamation
8859 lb ; GL        ; Glue
8860 lb ; HY        ; Hyphen
8861 lb ; ID        ; Ideographic
8862 lb ; IN        ; Inseperable
8863 lb ; IS        ; Infix_Numeric
8864 lb ; LF        ; Line_Feed
8865 lb ; NS        ; Nonstarter
8866 lb ; NU        ; Numeric
8867 lb ; OP        ; Open_Punctuation
8868 lb ; PO        ; Postfix_Numeric
8869 lb ; PR        ; Prefix_Numeric
8870 lb ; QU        ; Quotation
8871 lb ; SA        ; Complex_Context
8872 lb ; SG        ; Surrogate
8873 lb ; SP        ; Space
8874 lb ; SY        ; Break_Symbols
8875 lb ; XX        ; Unknown
8876 lb ; ZW        ; ZWSpace
8877 END
8878     }
8879
8880     if (-e 'DNormalizationProps.txt') {
8881         push @return, split /\n/, <<'END';
8882 qc ; M         ; Maybe
8883 qc ; N         ; No
8884 qc ; Y         ; Yes
8885 END
8886     }
8887
8888     if (-e 'Scripts.txt') {
8889         push @return, split /\n/, <<'END';
8890 sc ; Arab      ; Arabic
8891 sc ; Armn      ; Armenian
8892 sc ; Beng      ; Bengali
8893 sc ; Bopo      ; Bopomofo
8894 sc ; Cans      ; Canadian_Aboriginal
8895 sc ; Cher      ; Cherokee
8896 sc ; Cyrl      ; Cyrillic
8897 sc ; Deva      ; Devanagari
8898 sc ; Dsrt      ; Deseret
8899 sc ; Ethi      ; Ethiopic
8900 sc ; Geor      ; Georgian
8901 sc ; Goth      ; Gothic
8902 sc ; Grek      ; Greek
8903 sc ; Gujr      ; Gujarati
8904 sc ; Guru      ; Gurmukhi
8905 sc ; Hang      ; Hangul
8906 sc ; Hani      ; Han
8907 sc ; Hebr      ; Hebrew
8908 sc ; Hira      ; Hiragana
8909 sc ; Ital      ; Old_Italic
8910 sc ; Kana      ; Katakana
8911 sc ; Khmr      ; Khmer
8912 sc ; Knda      ; Kannada
8913 sc ; Laoo      ; Lao
8914 sc ; Latn      ; Latin
8915 sc ; Mlym      ; Malayalam
8916 sc ; Mong      ; Mongolian
8917 sc ; Mymr      ; Myanmar
8918 sc ; Ogam      ; Ogham
8919 sc ; Orya      ; Oriya
8920 sc ; Qaai      ; Inherited
8921 sc ; Runr      ; Runic
8922 sc ; Sinh      ; Sinhala
8923 sc ; Syrc      ; Syriac
8924 sc ; Taml      ; Tamil
8925 sc ; Telu      ; Telugu
8926 sc ; Thaa      ; Thaana
8927 sc ; Thai      ; Thai
8928 sc ; Tibt      ; Tibetan
8929 sc ; Yiii      ; Yi
8930 sc ; Zyyy      ; Common
8931 END
8932     }
8933
8934     if ($v_version ge v2.0.0) {
8935         push @return, split /\n/, <<'END';
8936 dt ; com       ; compat
8937 dt ; nar       ; narrow
8938 dt ; sml       ; small
8939 dt ; vert      ; vertical
8940 dt ; wide      ; wide
8941
8942 gc ; Cf        ; Format
8943 gc ; Cs        ; Surrogate
8944 gc ; Lt        ; Titlecase_Letter
8945 gc ; Me        ; Enclosing_Mark
8946 gc ; Nl        ; Letter_Number
8947 gc ; Pc        ; Connector_Punctuation
8948 gc ; Sk        ; Modifier_Symbol
8949 END
8950     }
8951     if ($v_version ge v2.1.2) {
8952         push @return, "bc ; S         ; Segment_Separator\n";
8953     }
8954     if ($v_version ge v2.1.5) {
8955         push @return, split /\n/, <<'END';
8956 gc ; Pf        ; Final_Punctuation
8957 gc ; Pi        ; Initial_Punctuation
8958 END
8959     }
8960     if ($v_version ge v2.1.8) {
8961         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
8962     }
8963
8964     if ($v_version ge v3.0.0) {
8965         push @return, split /\n/, <<'END';
8966 bc ; AL        ; Arabic_Letter
8967 bc ; BN        ; Boundary_Neutral
8968 bc ; LRE       ; Left_To_Right_Embedding
8969 bc ; LRO       ; Left_To_Right_Override
8970 bc ; NSM       ; Nonspacing_Mark
8971 bc ; PDF       ; Pop_Directional_Format
8972 bc ; RLE       ; Right_To_Left_Embedding
8973 bc ; RLO       ; Right_To_Left_Override
8974
8975 ccc; 233; DB   ; Double_Below
8976 END
8977     }
8978
8979     if ($v_version ge v3.1.0) {
8980         push @return, "ccc; 226; R    ; Right\n";
8981     }
8982
8983     return @return;
8984 }
8985
8986 sub output_perl_charnames_line ($$) {
8987
8988     # Output the entries in Perl_charnames specially, using 5 digits instead
8989     # of four.  This makes the entries a constant length, and simplifies
8990     # charnames.pm which this table is for.  Unicode can have 6 digit
8991     # ordinals, but they are all private use or noncharacters which do not
8992     # have names, so won't be in this table.
8993
8994     return sprintf "%05X\t%s\n", $_[0], $_[1];
8995 }
8996
8997 { # Closure
8998     # This is used to store the range list of all the code points usable when
8999     # the little used $compare_versions feature is enabled.
9000     my $compare_versions_range_list;
9001
9002     sub process_generic_property_file {
9003         # This processes a file containing property mappings and puts them
9004         # into internal map tables.  It should be used to handle any property
9005         # files that have mappings from a code point or range thereof to
9006         # something else.  This means almost all the UCD .txt files.
9007         # each_line_handlers() should be set to adjust the lines of these
9008         # files, if necessary, to what this routine understands:
9009         #
9010         # 0374          ; NFD_QC; N
9011         # 003C..003E    ; Math
9012         #
9013         # the fields are: "codepoint-range ; property; map"
9014         #
9015         # meaning the codepoints in the range all have the value 'map' under
9016         # 'property'.
9017         # Beginning and trailing white space in each field are not significant.
9018         # Note there is not a trailing semi-colon in the above.  A trailing
9019         # semi-colon means the map is a null-string.  An omitted map, as
9020         # opposed to a null-string, is assumed to be 'Y', based on Unicode
9021         # table syntax.  (This could have been hidden from this routine by
9022         # doing it in the $file object, but that would require parsing of the
9023         # line there, so would have to parse it twice, or change the interface
9024         # to pass this an array.  So not done.)
9025         #
9026         # The map field may begin with a sequence of commands that apply to
9027         # this range.  Each such command begins and ends with $CMD_DELIM.
9028         # These are used to indicate, for example, that the mapping for a
9029         # range has a non-default type.
9030         #
9031         # This loops through the file, calling it's next_line() method, and
9032         # then taking the map and adding it to the property's table.
9033         # Complications arise because any number of properties can be in the
9034         # file, in any order, interspersed in any way.  The first time a
9035         # property is seen, it gets information about that property and
9036         # caches it for quick retrieval later.  It also normalizes the maps
9037         # so that only one of many synonyms is stored.  The Unicode input
9038         # files do use some multiple synonyms.
9039
9040         my $file = shift;
9041         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9042
9043         my %property_info;               # To keep track of what properties
9044                                          # have already had entries in the
9045                                          # current file, and info about each,
9046                                          # so don't have to recompute.
9047         my $property_name;               # property currently being worked on
9048         my $property_type;               # and its type
9049         my $previous_property_name = ""; # name from last time through loop
9050         my $property_object;             # pointer to the current property's
9051                                          # object
9052         my $property_addr;               # the address of that object
9053         my $default_map;                 # the string that code points missing
9054                                          # from the file map to
9055         my $default_table;               # For non-string properties, a
9056                                          # reference to the match table that
9057                                          # will contain the list of code
9058                                          # points that map to $default_map.
9059
9060         # Get the next real non-comment line
9061         LINE:
9062         while ($file->next_line) {
9063
9064             # Default replacement type; means that if parts of the range have
9065             # already been stored in our tables, the new map overrides them if
9066             # they differ more than cosmetically
9067             my $replace = $IF_NOT_EQUIVALENT;
9068             my $map_type;            # Default type for the map of this range
9069
9070             #local $to_trace = 1 if main::DEBUG;
9071             trace $_ if main::DEBUG && $to_trace;
9072
9073             # Split the line into components
9074             my ($range, $property_name, $map, @remainder)
9075                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9076
9077             # If more or less on the line than we are expecting, warn and skip
9078             # the line
9079             if (@remainder) {
9080                 $file->carp_bad_line('Extra fields');
9081                 next LINE;
9082             }
9083             elsif ( ! defined $property_name) {
9084                 $file->carp_bad_line('Missing property');
9085                 next LINE;
9086             }
9087
9088             # Examine the range.
9089             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9090             {
9091                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9092                 next LINE;
9093             }
9094             my $low = hex $1;
9095             my $high = (defined $2) ? hex $2 : $low;
9096
9097             # For the very specialized case of comparing two Unicode
9098             # versions...
9099             if (DEBUG && $compare_versions) {
9100                 if ($property_name eq 'Age') {
9101
9102                     # Only allow code points at least as old as the version
9103                     # specified.
9104                     my $age = pack "C*", split(/\./, $map);        # v string
9105                     next LINE if $age gt $compare_versions;
9106                 }
9107                 else {
9108
9109                     # Again, we throw out code points younger than those of
9110                     # the specified version.  By now, the Age property is
9111                     # populated.  We use the intersection of each input range
9112                     # with this property to find what code points in it are
9113                     # valid.   To do the intersection, we have to convert the
9114                     # Age property map to a Range_list.  We only have to do
9115                     # this once.
9116                     if (! defined $compare_versions_range_list) {
9117                         my $age = property_ref('Age');
9118                         if (! -e 'DAge.txt') {
9119                             croak "Need to have 'DAge.txt' file to do version comparison";
9120                         }
9121                         elsif ($age->count == 0) {
9122                             croak "The 'Age' table is empty, but its file exists";
9123                         }
9124                         $compare_versions_range_list
9125                                         = Range_List->new(Initialize => $age);
9126                     }
9127
9128                     # An undefined map is always 'Y'
9129                     $map = 'Y' if ! defined $map;
9130
9131                     # Calculate the intersection of the input range with the
9132                     # code points that are known in the specified version
9133                     my @ranges = ($compare_versions_range_list
9134                                   & Range->new($low, $high))->ranges;
9135
9136                     # If the intersection is empty, throw away this range
9137                     next LINE unless @ranges;
9138
9139                     # Only examine the first range this time through the loop.
9140                     my $this_range = shift @ranges;
9141
9142                     # Put any remaining ranges in the queue to be processed
9143                     # later.  Note that there is unnecessary work here, as we
9144                     # will do the intersection again for each of these ranges
9145                     # during some future iteration of the LINE loop, but this
9146                     # code is not used in production.  The later intersections
9147                     # are guaranteed to not splinter, so this will not become
9148                     # an infinite loop.
9149                     my $line = join ';', $property_name, $map;
9150                     foreach my $range (@ranges) {
9151                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9152                                                             $range->start,
9153                                                             $range->end,
9154                                                             $line));
9155                     }
9156
9157                     # And process the first range, like any other.
9158                     $low = $this_range->start;
9159                     $high = $this_range->end;
9160                 }
9161             } # End of $compare_versions
9162
9163             # If changing to a new property, get the things constant per
9164             # property
9165             if ($previous_property_name ne $property_name) {
9166
9167                 $property_object = property_ref($property_name);
9168                 if (! defined $property_object) {
9169                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
9170                     next LINE;
9171                 }
9172                 { no overloading; $property_addr = pack 'J', $property_object; }
9173
9174                 # Defer changing names until have a line that is acceptable
9175                 # (the 'next' statement above means is unacceptable)
9176                 $previous_property_name = $property_name;
9177
9178                 # If not the first time for this property, retrieve info about
9179                 # it from the cache
9180                 if (defined ($property_info{$property_addr}{'type'})) {
9181                     $property_type = $property_info{$property_addr}{'type'};
9182                     $default_map = $property_info{$property_addr}{'default'};
9183                     $map_type
9184                         = $property_info{$property_addr}{'pseudo_map_type'};
9185                     $default_table
9186                             = $property_info{$property_addr}{'default_table'};
9187                 }
9188                 else {
9189
9190                     # Here, is the first time for this property.  Set up the
9191                     # cache.
9192                     $property_type = $property_info{$property_addr}{'type'}
9193                                    = $property_object->type;
9194                     $map_type
9195                         = $property_info{$property_addr}{'pseudo_map_type'}
9196                         = $property_object->pseudo_map_type;
9197
9198                     # The Unicode files are set up so that if the map is not
9199                     # defined, it is a binary property
9200                     if (! defined $map && $property_type != $BINARY) {
9201                         if ($property_type != $UNKNOWN
9202                             && $property_type != $NON_STRING)
9203                         {
9204                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
9205                         }
9206                         else {
9207                             $property_object->set_type($BINARY);
9208                             $property_type
9209                                 = $property_info{$property_addr}{'type'}
9210                                 = $BINARY;
9211                         }
9212                     }
9213
9214                     # Get any @missings default for this property.  This
9215                     # should precede the first entry for the property in the
9216                     # input file, and is located in a comment that has been
9217                     # stored by the Input_file class until we access it here.
9218                     # It's possible that there is more than one such line
9219                     # waiting for us; collect them all, and parse
9220                     my @missings_list = $file->get_missings
9221                                             if $file->has_missings_defaults;
9222                     foreach my $default_ref (@missings_list) {
9223                         my $default = $default_ref->[0];
9224                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9225
9226                         # For string properties, the default is just what the
9227                         # file says, but non-string properties should already
9228                         # have set up a table for the default property value;
9229                         # use the table for these, so can resolve synonyms
9230                         # later to a single standard one.
9231                         if ($property_type == $STRING
9232                             || $property_type == $UNKNOWN)
9233                         {
9234                             $property_info{$addr}{'missings'} = $default;
9235                         }
9236                         else {
9237                             $property_info{$addr}{'missings'}
9238                                         = $property_object->table($default);
9239                         }
9240                     }
9241
9242                     # Finished storing all the @missings defaults in the input
9243                     # file so far.  Get the one for the current property.
9244                     my $missings = $property_info{$property_addr}{'missings'};
9245
9246                     # But we likely have separately stored what the default
9247                     # should be.  (This is to accommodate versions of the
9248                     # standard where the @missings lines are absent or
9249                     # incomplete.)  Hopefully the two will match.  But check
9250                     # it out.
9251                     $default_map = $property_object->default_map;
9252
9253                     # If the map is a ref, it means that the default won't be
9254                     # processed until later, so undef it, so next few lines
9255                     # will redefine it to something that nothing will match
9256                     undef $default_map if ref $default_map;
9257
9258                     # Create a $default_map if don't have one; maybe a dummy
9259                     # that won't match anything.
9260                     if (! defined $default_map) {
9261
9262                         # Use any @missings line in the file.
9263                         if (defined $missings) {
9264                             if (ref $missings) {
9265                                 $default_map = $missings->full_name;
9266                                 $default_table = $missings;
9267                             }
9268                             else {
9269                                 $default_map = $missings;
9270                             }
9271
9272                             # And store it with the property for outside use.
9273                             $property_object->set_default_map($default_map);
9274                         }
9275                         else {
9276
9277                             # Neither an @missings nor a default map.  Create
9278                             # a dummy one, so won't have to test definedness
9279                             # in the main loop.
9280                             $default_map = '_Perl This will never be in a file
9281                                             from Unicode';
9282                         }
9283                     }
9284
9285                     # Here, we have $default_map defined, possibly in terms of
9286                     # $missings, but maybe not, and possibly is a dummy one.
9287                     if (defined $missings) {
9288
9289                         # Make sure there is no conflict between the two.
9290                         # $missings has priority.
9291                         if (ref $missings) {
9292                             $default_table
9293                                         = $property_object->table($default_map);
9294                             if (! defined $default_table
9295                                 || $default_table != $missings)
9296                             {
9297                                 if (! defined $default_table) {
9298                                     $default_table = $UNDEF;
9299                                 }
9300                                 $file->carp_bad_line(<<END
9301 The \@missings line for $property_name in $file says that missings default to
9302 $missings, but we expect it to be $default_table.  $missings used.
9303 END
9304                                 );
9305                                 $default_table = $missings;
9306                                 $default_map = $missings->full_name;
9307                             }
9308                             $property_info{$property_addr}{'default_table'}
9309                                                         = $default_table;
9310                         }
9311                         elsif ($default_map ne $missings) {
9312                             $file->carp_bad_line(<<END
9313 The \@missings line for $property_name in $file says that missings default to
9314 $missings, but we expect it to be $default_map.  $missings used.
9315 END
9316                             );
9317                             $default_map = $missings;
9318                         }
9319                     }
9320
9321                     $property_info{$property_addr}{'default'}
9322                                                     = $default_map;
9323
9324                     # If haven't done so already, find the table corresponding
9325                     # to this map for non-string properties.
9326                     if (! defined $default_table
9327                         && $property_type != $STRING
9328                         && $property_type != $UNKNOWN)
9329                     {
9330                         $default_table = $property_info{$property_addr}
9331                                                         {'default_table'}
9332                                     = $property_object->table($default_map);
9333                     }
9334                 } # End of is first time for this property
9335             } # End of switching properties.
9336
9337             # Ready to process the line.
9338             # The Unicode files are set up so that if the map is not defined,
9339             # it is a binary property with value 'Y'
9340             if (! defined $map) {
9341                 $map = 'Y';
9342             }
9343             else {
9344
9345                 # If the map begins with a special command to us (enclosed in
9346                 # delimiters), extract the command(s).
9347                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9348                     my $command = $1;
9349                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
9350                         $replace = $1;
9351                     }
9352                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
9353                         $map_type = $1;
9354                     }
9355                     else {
9356                         $file->carp_bad_line("Unknown command line: '$1'");
9357                         next LINE;
9358                     }
9359                 }
9360             }
9361
9362             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9363             {
9364
9365                 # Here, we have a map to a particular code point, and the
9366                 # default map is to a code point itself.  If the range
9367                 # includes the particular code point, change that portion of
9368                 # the range to the default.  This makes sure that in the final
9369                 # table only the non-defaults are listed.
9370                 my $decimal_map = hex $map;
9371                 if ($low <= $decimal_map && $decimal_map <= $high) {
9372
9373                     # If the range includes stuff before or after the map
9374                     # we're changing, split it and process the split-off parts
9375                     # later.
9376                     if ($low < $decimal_map) {
9377                         $file->insert_adjusted_lines(
9378                                             sprintf("%04X..%04X; %s; %s",
9379                                                     $low,
9380                                                     $decimal_map - 1,
9381                                                     $property_name,
9382                                                     $map));
9383                     }
9384                     if ($high > $decimal_map) {
9385                         $file->insert_adjusted_lines(
9386                                             sprintf("%04X..%04X; %s; %s",
9387                                                     $decimal_map + 1,
9388                                                     $high,
9389                                                     $property_name,
9390                                                     $map));
9391                     }
9392                     $low = $high = $decimal_map;
9393                     $map = $CODE_POINT;
9394                 }
9395             }
9396
9397             # If we can tell that this is a synonym for the default map, use
9398             # the default one instead.
9399             if ($property_type != $STRING
9400                 && $property_type != $UNKNOWN)
9401             {
9402                 my $table = $property_object->table($map);
9403                 if (defined $table && $table == $default_table) {
9404                     $map = $default_map;
9405                 }
9406             }
9407
9408             # And figure out the map type if not known.
9409             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9410                 if ($map eq "") {   # Nulls are always $NULL map type
9411                     $map_type = $NULL;
9412                 } # Otherwise, non-strings, and those that don't allow
9413                   # $MULTI_CP, and those that aren't multiple code points are
9414                   # 0
9415                 elsif
9416                    (($property_type != $STRING && $property_type != $UNKNOWN)
9417                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9418                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
9419                 {
9420                     $map_type = 0;
9421                 }
9422                 else {
9423                     $map_type = $MULTI_CP;
9424                 }
9425             }
9426
9427             $property_object->add_map($low, $high,
9428                                         $map,
9429                                         Type => $map_type,
9430                                         Replace => $replace);
9431         } # End of loop through file's lines
9432
9433         return;
9434     }
9435 }
9436
9437 { # Closure for UnicodeData.txt handling
9438
9439     # This file was the first one in the UCD; its design leads to some
9440     # awkwardness in processing.  Here is a sample line:
9441     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9442     # The fields in order are:
9443     my $i = 0;            # The code point is in field 0, and is shifted off.
9444     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
9445     my $CATEGORY = $i++;  # category (e.g. "Lu")
9446     my $CCC = $i++;       # Canonical combining class (e.g. "230")
9447     my $BIDI = $i++;      # directional class (e.g. "L")
9448     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
9449     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
9450     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9451                                          # Dual-use in this program; see below
9452     my $NUMERIC = $i++;   # numeric value
9453     my $MIRRORED = $i++;  # ? mirrored
9454     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9455     my $COMMENT = $i++;   # iso comment
9456     my $UPPER = $i++;     # simple uppercase mapping
9457     my $LOWER = $i++;     # simple lowercase mapping
9458     my $TITLE = $i++;     # simple titlecase mapping
9459     my $input_field_count = $i;
9460
9461     # This routine in addition outputs these extra fields:
9462     my $DECOMP_TYPE = $i++; # Decomposition type
9463
9464     # These fields are modifications of ones above, and are usually
9465     # suppressed; they must come last, as for speed, the loop upper bound is
9466     # normally set to ignore them
9467     my $NAME = $i++;        # This is the strict name field, not the one that
9468                             # charnames uses.
9469     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
9470                             # by Unicode::Normalize
9471     my $last_field = $i - 1;
9472
9473     # All these are read into an array for each line, with the indices defined
9474     # above.  The empty fields in the example line above indicate that the
9475     # value is defaulted.  The handler called for each line of the input
9476     # changes these to their defaults.
9477
9478     # Here are the official names of the properties, in a parallel array:
9479     my @field_names;
9480     $field_names[$BIDI] = 'Bidi_Class';
9481     $field_names[$CATEGORY] = 'General_Category';
9482     $field_names[$CCC] = 'Canonical_Combining_Class';
9483     $field_names[$CHARNAME] = 'Perl_Charnames';
9484     $field_names[$COMMENT] = 'ISO_Comment';
9485     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9486     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9487     $field_names[$LOWER] = 'Lowercase_Mapping';
9488     $field_names[$MIRRORED] = 'Bidi_Mirrored';
9489     $field_names[$NAME] = 'Name';
9490     $field_names[$NUMERIC] = 'Numeric_Value';
9491     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9492     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9493     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9494     $field_names[$TITLE] = 'Titlecase_Mapping';
9495     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9496     $field_names[$UPPER] = 'Uppercase_Mapping';
9497
9498     # Some of these need a little more explanation:
9499     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9500     #   property, but is used in calculating the Numeric_Type.  Perl however,
9501     #   creates a file from this field, so a Perl property is created from it.
9502     # Similarly, the Other_Digit field is used only for calculating the
9503     #   Numeric_Type, and so it can be safely re-used as the place to store
9504     #   the value for Numeric_Type; hence it is referred to as
9505     #   $NUMERIC_TYPE_OTHER_DIGIT.
9506     # The input field named $PERL_DECOMPOSITION is a combination of both the
9507     #   decomposition mapping and its type.  Perl creates a file containing
9508     #   exactly this field, so it is used for that.  The two properties are
9509     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9510     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
9511     #   output it), as Perl doesn't use it directly.
9512     # The input field named here $CHARNAME is used to construct the
9513     #   Perl_Charnames property, which is a combination of the Name property
9514     #   (which the input field contains), and the Unicode_1_Name property, and
9515     #   others from other files.  Since, the strict Name property is not used
9516     #   by Perl, this field is used for the table that Perl does use.  The
9517     #   strict Name property table is usually suppressed (unless the lists are
9518     #   changed to output it), so it is accumulated in a separate field,
9519     #   $NAME, which to save time is discarded unless the table is actually to
9520     #   be output
9521
9522     # This file is processed like most in this program.  Control is passed to
9523     # process_generic_property_file() which calls filter_UnicodeData_line()
9524     # for each input line.  This filter converts the input into line(s) that
9525     # process_generic_property_file() understands.  There is also a setup
9526     # routine called before any of the file is processed, and a handler for
9527     # EOF processing, all in this closure.
9528
9529     # A huge speed-up occurred at the cost of some added complexity when these
9530     # routines were altered to buffer the outputs into ranges.  Almost all the
9531     # lines of the input file apply to just one code point, and for most
9532     # properties, the map for the next code point up is the same as the
9533     # current one.  So instead of creating a line for each property for each
9534     # input line, filter_UnicodeData_line() remembers what the previous map
9535     # of a property was, and doesn't generate a line to pass on until it has
9536     # to, as when the map changes; and that passed-on line encompasses the
9537     # whole contiguous range of code points that have the same map for that
9538     # property.  This means a slight amount of extra setup, and having to
9539     # flush these buffers on EOF, testing if the maps have changed, plus
9540     # remembering state information in the closure.  But it means a lot less
9541     # real time in not having to change the data base for each property on
9542     # each line.
9543
9544     # Another complication is that there are already a few ranges designated
9545     # in the input.  There are two lines for each, with the same maps except
9546     # the code point and name on each line.  This was actually the hardest
9547     # thing to design around.  The code points in those ranges may actually
9548     # have real maps not given by these two lines.  These maps will either
9549     # be algorithmically determinable, or in the extracted files furnished
9550     # with the UCD.  In the event of conflicts between these extracted files,
9551     # and this one, Unicode says that this one prevails.  But it shouldn't
9552     # prevail for conflicts that occur in these ranges.  The data from the
9553     # extracted files prevails in those cases.  So, this program is structured
9554     # so that those files are processed first, storing maps.  Then the other
9555     # files are processed, generally overwriting what the extracted files
9556     # stored.  But just the range lines in this input file are processed
9557     # without overwriting.  This is accomplished by adding a special string to
9558     # the lines output to tell process_generic_property_file() to turn off the
9559     # overwriting for just this one line.
9560     # A similar mechanism is used to tell it that the map is of a non-default
9561     # type.
9562
9563     sub setup_UnicodeData { # Called before any lines of the input are read
9564         my $file = shift;
9565         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9566
9567         # Create a new property specially located that is a combination of the
9568         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9569         # Name_Alias properties.  (The final duplicates elements of the
9570         # first.)  A comment for it will later be constructed based on the
9571         # actual properties present and used
9572         $perl_charname = Property->new('Perl_Charnames',
9573                        Core_Access => '\N{...} and "use charnames"',
9574                        Default_Map => "",
9575                        Directory => File::Spec->curdir(),
9576                        File => 'Name',
9577                        Internal_Only_Warning => 1,
9578                        Perl_Extension => 1,
9579                        Range_Size_1 => \&output_perl_charnames_line,
9580                        Type => $STRING,
9581                        );
9582
9583         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9584                                         Directory => File::Spec->curdir(),
9585                                         File => 'Decomposition',
9586                                         Format => $DECOMP_STRING_FORMAT,
9587                                         Internal_Only_Warning => 1,
9588                                         Perl_Extension => 1,
9589                                         Default_Map => $CODE_POINT,
9590
9591                                         # normalize.pm can't cope with these
9592                                         Output_Range_Counts => 0,
9593
9594                                         # This is a specially formatted table
9595                                         # explicitly for normalize.pm, which
9596                                         # is expecting a particular format,
9597                                         # which means that mappings containing
9598                                         # multiple code points are in the main
9599                                         # body of the table
9600                                         Map_Type => $COMPUTE_NO_MULTI_CP,
9601                                         Type => $STRING,
9602                                         );
9603         $Perl_decomp->add_comment(join_lines(<<END
9604 This mapping is a combination of the Unicode 'Decomposition_Type' and
9605 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
9606 identical to the official Unicode 'Decomposition_Mapping'  property except for
9607 two things:
9608  1) It omits the algorithmically determinable Hangul syllable decompositions,
9609 which normalize.pm handles algorithmically.
9610  2) It contains the decomposition type as well.  Non-canonical decompositions
9611 begin with a word in angle brackets, like <super>, which denotes the
9612 compatible decomposition type.  If the map does not begin with the <angle
9613 brackets>, the decomposition is canonical.
9614 END
9615         ));
9616
9617         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9618                                         Default_Map => "",
9619                                         Perl_Extension => 1,
9620                                         File => 'Digit',    # Trad. location
9621                                         Directory => $map_directory,
9622                                         Type => $STRING,
9623                                         Range_Size_1 => 1,
9624                                         );
9625         $Decimal_Digit->add_comment(join_lines(<<END
9626 This file gives the mapping of all code points which represent a single
9627 decimal digit [0-9] to their respective digits.  For example, the code point
9628 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
9629 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9630 numerals.
9631 END
9632         ));
9633
9634         # These properties are not used for generating anything else, and are
9635         # usually not output.  By making them last in the list, we can just
9636         # change the high end of the loop downwards to avoid the work of
9637         # generating a table(s) that is/are just going to get thrown away.
9638         if (! property_ref('Decomposition_Mapping')->to_output_map
9639             && ! property_ref('Name')->to_output_map)
9640         {
9641             $last_field = min($NAME, $DECOMP_MAP) - 1;
9642         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9643             $last_field = $DECOMP_MAP;
9644         } elsif (property_ref('Name')->to_output_map) {
9645             $last_field = $NAME;
9646         }
9647         return;
9648     }
9649
9650     my $first_time = 1;                 # ? Is this the first line of the file
9651     my $in_range = 0;                   # ? Are we in one of the file's ranges
9652     my $previous_cp;                    # hex code point of previous line
9653     my $decimal_previous_cp = -1;       # And its decimal equivalent
9654     my @start;                          # For each field, the current starting
9655                                         # code point in hex for the range
9656                                         # being accumulated.
9657     my @fields;                         # The input fields;
9658     my @previous_fields;                # And those from the previous call
9659
9660     sub filter_UnicodeData_line {
9661         # Handle a single input line from UnicodeData.txt; see comments above
9662         # Conceptually this takes a single line from the file containing N
9663         # properties, and converts it into N lines with one property per line,
9664         # which is what the final handler expects.  But there are
9665         # complications due to the quirkiness of the input file, and to save
9666         # time, it accumulates ranges where the property values don't change
9667         # and only emits lines when necessary.  This is about an order of
9668         # magnitude fewer lines emitted.
9669
9670         my $file = shift;
9671         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9672
9673         # $_ contains the input line.
9674         # -1 in split means retain trailing null fields
9675         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9676
9677         #local $to_trace = 1 if main::DEBUG;
9678         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9679         if (@fields > $input_field_count) {
9680             $file->carp_bad_line('Extra fields');
9681             $_ = "";
9682             return;
9683         }
9684
9685         my $decimal_cp = hex $cp;
9686
9687         # We have to output all the buffered ranges when the next code point
9688         # is not exactly one after the previous one, which means there is a
9689         # gap in the ranges.
9690         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9691
9692         # The decomposition mapping field requires special handling.  It looks
9693         # like either:
9694         #
9695         # <compat> 0032 0020
9696         # 0041 0300
9697         #
9698         # The decomposition type is enclosed in <brackets>; if missing, it
9699         # means the type is canonical.  There are two decomposition mapping
9700         # tables: the one for use by Perl's normalize.pm has a special format
9701         # which is this field intact; the other, for general use is of
9702         # standard format.  In either case we have to find the decomposition
9703         # type.  Empty fields have None as their type, and map to the code
9704         # point itself
9705         if ($fields[$PERL_DECOMPOSITION] eq "") {
9706             $fields[$DECOMP_TYPE] = 'None';
9707             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9708         }
9709         else {
9710             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9711                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
9712             if (! defined $fields[$DECOMP_TYPE]) {
9713                 $fields[$DECOMP_TYPE] = 'Canonical';
9714                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9715             }
9716             else {
9717                 $fields[$DECOMP_MAP] = $map;
9718             }
9719         }
9720
9721         # The 3 numeric fields also require special handling.  The 2 digit
9722         # fields must be either empty or match the number field.  This means
9723         # that if it is empty, they must be as well, and the numeric type is
9724         # None, and the numeric value is 'Nan'.
9725         # The decimal digit field must be empty or match the other digit
9726         # field.  If the decimal digit field is non-empty, the code point is
9727         # a decimal digit, and the other two fields will have the same value.
9728         # If it is empty, but the other digit field is non-empty, the code
9729         # point is an 'other digit', and the number field will have the same
9730         # value as the other digit field.  If the other digit field is empty,
9731         # but the number field is non-empty, the code point is a generic
9732         # numeric type.
9733         if ($fields[$NUMERIC] eq "") {
9734             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9735                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9736             ) {
9737                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
9738             }
9739             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9740             $fields[$NUMERIC] = 'NaN';
9741         }
9742         else {
9743             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9744             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9745                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9746                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9747             }
9748             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9749                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9750                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9751             }
9752             else {
9753                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9754
9755                 # Rationals require extra effort.
9756                 register_fraction($fields[$NUMERIC])
9757                                                 if $fields[$NUMERIC] =~ qr{/};
9758             }
9759         }
9760
9761         # For the properties that have empty fields in the file, and which
9762         # mean something different from empty, change them to that default.
9763         # Certain fields just haven't been empty so far in any Unicode
9764         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9765         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
9766         # the defaults; which are very unlikely to ever change.
9767         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9768         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9769
9770         # UAX44 says that if title is empty, it is the same as whatever upper
9771         # is,
9772         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9773
9774         # There are a few pairs of lines like:
9775         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9776         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9777         # that define ranges.  These should be processed after the fields are
9778         # adjusted above, as they may override some of them; but mostly what
9779         # is left is to possibly adjust the $CHARNAME field.  The names of all the
9780         # paired lines start with a '<', but this is also true of '<control>,
9781         # which isn't one of these special ones.
9782         if ($fields[$CHARNAME] eq '<control>') {
9783
9784             # Some code points in this file have the pseudo-name
9785             # '<control>', but the official name for such ones is the null
9786             # string.  For charnames.pm, we use the Unicode version 1 name
9787             $fields[$NAME] = "";
9788             $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
9789
9790             # We had better not be in between range lines.
9791             if ($in_range) {
9792                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
9793                 $in_range = 0;
9794             }
9795         }
9796         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
9797
9798             # Here is a non-range line.  We had better not be in between range
9799             # lines.
9800             if ($in_range) {
9801                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
9802                 $in_range = 0;
9803             }
9804             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
9805
9806                 # These are code points whose names end in their code points,
9807                 # which means the names are algorithmically derivable from the
9808                 # code points.  To shorten the output Name file, the algorithm
9809                 # for deriving these is placed in the file instead of each
9810                 # code point, so they have map type $CP_IN_NAME
9811                 $fields[$CHARNAME] = $CMD_DELIM
9812                                  . $MAP_TYPE_CMD
9813                                  . '='
9814                                  . $CP_IN_NAME
9815                                  . $CMD_DELIM
9816                                  . $fields[$CHARNAME];
9817             }
9818             $fields[$NAME] = $fields[$CHARNAME];
9819         }
9820         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
9821             $fields[$CHARNAME] = $fields[$NAME] = $1;
9822
9823             # Here we are at the beginning of a range pair.
9824             if ($in_range) {
9825                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
9826             }
9827             $in_range = 1;
9828
9829             # Because the properties in the range do not overwrite any already
9830             # in the db, we must flush the buffers of what's already there, so
9831             # they get handled in the normal scheme.
9832             $force_output = 1;
9833
9834         }
9835         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
9836             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
9837             $_ = "";
9838             return;
9839         }
9840         else { # Here, we are at the last line of a range pair.
9841
9842             if (! $in_range) {
9843                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
9844                 $_ = "";
9845                 return;
9846             }
9847             $in_range = 0;
9848
9849             $fields[$NAME] = $fields[$CHARNAME];
9850
9851             # Check that the input is valid: that the closing of the range is
9852             # the same as the beginning.
9853             foreach my $i (0 .. $last_field) {
9854                 next if $fields[$i] eq $previous_fields[$i];
9855                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
9856             }
9857
9858             # The processing differs depending on the type of range,
9859             # determined by its $CHARNAME
9860             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
9861
9862                 # Check that the data looks right.
9863                 if ($decimal_previous_cp != $SBase) {
9864                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
9865                 }
9866                 if ($decimal_cp != $SBase + $SCount - 1) {
9867                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
9868                 }
9869
9870                 # The Hangul syllable range has a somewhat complicated name
9871                 # generation algorithm.  Each code point in it has a canonical
9872                 # decomposition also computable by an algorithm.  The
9873                 # perl decomposition map table built from these is used only
9874                 # by normalize.pm, which has the algorithm built in it, so the
9875                 # decomposition maps are not needed, and are large, so are
9876                 # omitted from it.  If the full decomposition map table is to
9877                 # be output, the decompositions are generated for it, in the
9878                 # EOF handling code for this input file.
9879
9880                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9881
9882                 # This range is stored in our internal structure with its
9883                 # own map type, different from all others.
9884                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9885                                         = $CMD_DELIM
9886                                           . $MAP_TYPE_CMD
9887                                           . '='
9888                                           . $HANGUL_SYLLABLE
9889                                           . $CMD_DELIM
9890                                           . $fields[$CHARNAME];
9891             }
9892             elsif ($fields[$CHARNAME] =~ /^CJK/) {
9893
9894                 # The name for these contains the code point itself, and all
9895                 # are defined to have the same base name, regardless of what
9896                 # is in the file.  They are stored in our internal structure
9897                 # with a map type of $CP_IN_NAME
9898                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9899                                         = $CMD_DELIM
9900                                            . $MAP_TYPE_CMD
9901                                            . '='
9902                                            . $CP_IN_NAME
9903                                            . $CMD_DELIM
9904                                            . 'CJK UNIFIED IDEOGRAPH';
9905
9906             }
9907             elsif ($fields[$CATEGORY] eq 'Co'
9908                      || $fields[$CATEGORY] eq 'Cs')
9909             {
9910                 # The names of all the code points in these ranges are set to
9911                 # null, as there are no names for the private use and
9912                 # surrogate code points.
9913
9914                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
9915             }
9916             else {
9917                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
9918             }
9919
9920             # The first line of the range caused everything else to be output,
9921             # and then its values were stored as the beginning values for the
9922             # next set of ranges, which this one ends.  Now, for each value,
9923             # add a command to tell the handler that these values should not
9924             # replace any existing ones in our database.
9925             foreach my $i (0 .. $last_field) {
9926                 $previous_fields[$i] = $CMD_DELIM
9927                                         . $REPLACE_CMD
9928                                         . '='
9929                                         . $NO
9930                                         . $CMD_DELIM
9931                                         . $previous_fields[$i];
9932             }
9933
9934             # And change things so it looks like the entire range has been
9935             # gone through with this being the final part of it.  Adding the
9936             # command above to each field will cause this range to be flushed
9937             # during the next iteration, as it guaranteed that the stored
9938             # field won't match whatever value the next one has.
9939             $previous_cp = $cp;
9940             $decimal_previous_cp = $decimal_cp;
9941
9942             # We are now set up for the next iteration; so skip the remaining
9943             # code in this subroutine that does the same thing, but doesn't
9944             # know about these ranges.
9945             $_ = "";
9946
9947             return;
9948         }
9949
9950         # On the very first line, we fake it so the code below thinks there is
9951         # nothing to output, and initialize so that when it does get output it
9952         # uses the first line's values for the lowest part of the range.
9953         # (One could avoid this by using peek(), but then one would need to
9954         # know the adjustments done above and do the same ones in the setup
9955         # routine; not worth it)
9956         if ($first_time) {
9957             $first_time = 0;
9958             @previous_fields = @fields;
9959             @start = ($cp) x scalar @fields;
9960             $decimal_previous_cp = $decimal_cp - 1;
9961         }
9962
9963         # For each field, output the stored up ranges that this code point
9964         # doesn't fit in.  Earlier we figured out if all ranges should be
9965         # terminated because of changing the replace or map type styles, or if
9966         # there is a gap between this new code point and the previous one, and
9967         # that is stored in $force_output.  But even if those aren't true, we
9968         # need to output the range if this new code point's value for the
9969         # given property doesn't match the stored range's.
9970         #local $to_trace = 1 if main::DEBUG;
9971         foreach my $i (0 .. $last_field) {
9972             my $field = $fields[$i];
9973             if ($force_output || $field ne $previous_fields[$i]) {
9974
9975                 # Flush the buffer of stored values.
9976                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9977
9978                 # Start a new range with this code point and its value
9979                 $start[$i] = $cp;
9980                 $previous_fields[$i] = $field;
9981             }
9982         }
9983
9984         # Set the values for the next time.
9985         $previous_cp = $cp;
9986         $decimal_previous_cp = $decimal_cp;
9987
9988         # The input line has generated whatever adjusted lines are needed, and
9989         # should not be looked at further.
9990         $_ = "";
9991         return;
9992     }
9993
9994     sub EOF_UnicodeData {
9995         # Called upon EOF to flush the buffers, and create the Hangul
9996         # decomposition mappings if needed.
9997
9998         my $file = shift;
9999         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10000
10001         # Flush the buffers.
10002         foreach my $i (1 .. $last_field) {
10003             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10004         }
10005
10006         if (-e 'Jamo.txt') {
10007
10008             # The algorithm is published by Unicode, based on values in
10009             # Jamo.txt, (which should have been processed before this
10010             # subroutine), and the results left in %Jamo
10011             unless (%Jamo) {
10012                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
10013                 return;
10014             }
10015
10016             # If the full decomposition map table is being output, insert
10017             # into it the Hangul syllable mappings.  This is to avoid having
10018             # to publish a subroutine in it to compute them.  (which would
10019             # essentially be this code.)  This uses the algorithm published by
10020             # Unicode.
10021             if (property_ref('Decomposition_Mapping')->to_output_map) {
10022                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10023                     use integer;
10024                     my $SIndex = $S - $SBase;
10025                     my $L = $LBase + $SIndex / $NCount;
10026                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
10027                     my $T = $TBase + $SIndex % $TCount;
10028
10029                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10030                     my $decomposition = sprintf("%04X %04X", $L, $V);
10031                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10032                     $file->insert_adjusted_lines(
10033                                 sprintf("%04X; Decomposition_Mapping; %s",
10034                                         $S,
10035                                         $decomposition));
10036                 }
10037             }
10038         }
10039
10040         return;
10041     }
10042
10043     sub filter_v1_ucd {
10044         # Fix UCD lines in version 1.  This is probably overkill, but this
10045         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
10046         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
10047         #       removed.  This program retains them
10048         # 2)    didn't include ranges, which it should have, and which are now
10049         #       added in @corrected_lines below.  It was hand populated by
10050         #       taking the data from Version 2, verified by analyzing
10051         #       DAge.txt.
10052         # 3)    There is a syntax error in the entry for U+09F8 which could
10053         #       cause problems for utf8_heavy, and so is changed.  It's
10054         #       numeric value was simply a minus sign, without any number.
10055         #       (Eventually Unicode changed the code point to non-numeric.)
10056         # 4)    The decomposition types often don't match later versions
10057         #       exactly, and the whole syntax of that field is different; so
10058         #       the syntax is changed as well as the types to their later
10059         #       terminology.  Otherwise normalize.pm would be very unhappy
10060         # 5)    Many ccc classes are different.  These are left intact.
10061         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
10062         #       fields.  These are unchanged because it doesn't really cause
10063         #       problems for Perl.
10064         # 7)    A number of code points, such as controls, don't have their
10065         #       Unicode Version 1 Names in this file.  These are unchanged.
10066
10067         my @corrected_lines = split /\n/, <<'END';
10068 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10069 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10070 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10071 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10072 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10073 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10074 END
10075
10076         my $file = shift;
10077         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10078
10079         #local $to_trace = 1 if main::DEBUG;
10080         trace $_ if main::DEBUG && $to_trace;
10081
10082         # -1 => retain trailing null fields
10083         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10084
10085         # At the first place that is wrong in the input, insert all the
10086         # corrections, replacing the wrong line.
10087         if ($code_point eq '4E00') {
10088             my @copy = @corrected_lines;
10089             $_ = shift @copy;
10090             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10091
10092             $file->insert_lines(@copy);
10093         }
10094
10095
10096         if ($fields[$NUMERIC] eq '-') {
10097             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
10098         }
10099
10100         if  ($fields[$PERL_DECOMPOSITION] ne "") {
10101
10102             # Several entries have this change to superscript 2 or 3 in the
10103             # middle.  Convert these to the modern version, which is to use
10104             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10105             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10106             # 'HHHH HHHH 00B3 HHHH'.
10107             # It turns out that all of these that don't have another
10108             # decomposition defined at the beginning of the line have the
10109             # <square> decomposition in later releases.
10110             if ($code_point ne '00B2' && $code_point ne '00B3') {
10111                 if  ($fields[$PERL_DECOMPOSITION]
10112                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10113                 {
10114                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10115                         $fields[$PERL_DECOMPOSITION] = '<square> '
10116                         . $fields[$PERL_DECOMPOSITION];
10117                     }
10118                 }
10119             }
10120
10121             # If is like '<+circled> 0052 <-circled>', convert to
10122             # '<circled> 0052'
10123             $fields[$PERL_DECOMPOSITION] =~
10124                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10125
10126             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10127             $fields[$PERL_DECOMPOSITION] =~
10128                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10129             or $fields[$PERL_DECOMPOSITION] =~
10130                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10131             or $fields[$PERL_DECOMPOSITION] =~
10132                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10133             or $fields[$PERL_DECOMPOSITION] =~
10134                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10135
10136             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10137             $fields[$PERL_DECOMPOSITION] =~
10138                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10139
10140             # Change names to modern form.
10141             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10142             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10143             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10144             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10145
10146             # One entry has weird braces
10147             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10148         }
10149
10150         $_ = join ';', $code_point, @fields;
10151         trace $_ if main::DEBUG && $to_trace;
10152         return;
10153     }
10154
10155     sub filter_v2_1_5_ucd {
10156         # A dozen entries in this 2.1.5 file had the mirrored and numeric
10157         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
10158         # column appears to be N, swap it back.
10159
10160         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10161         if ($fields[$NUMERIC] eq 'N') {
10162             $fields[$NUMERIC] = $fields[$MIRRORED];
10163             $fields[$MIRRORED] = 'N';
10164             $_ = join ';', $code_point, @fields;
10165         }
10166         return;
10167     }
10168
10169     sub filter_v6_ucd {
10170
10171         # Unicode 6.0 co-opted the name BELL for U+1F514, so change the input
10172         # to pretend that U+0007 is ALERT instead, and for Perl 5.14, don't
10173         # allow the BELL name for U+1F514, so that the old usage can be
10174         # deprecated for one cycle.
10175
10176         return if $_ !~ /^(?:0007|1F514|070F);/;
10177
10178         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10179         if ($code_point eq '0007') {
10180             $fields[$CHARNAME] = "ALERT";
10181         }
10182         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10183                             # http://www.unicode.org/versions/corrigendum8.html
10184             $fields[$BIDI] = "AL";
10185         }
10186         elsif ($^V lt v5.15.0) { # For 5.16 will convert to use Unicode's name
10187             $fields[$CHARNAME] = "";
10188         }
10189
10190         $_ = join ';', $code_point, @fields;
10191
10192         return;
10193     }
10194 } # End closure for UnicodeData
10195
10196 sub process_GCB_test {
10197
10198     my $file = shift;
10199     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10200
10201     while ($file->next_line) {
10202         push @backslash_X_tests, $_;
10203     }
10204
10205     return;
10206 }
10207
10208 sub process_NamedSequences {
10209     # NamedSequences.txt entries are just added to an array.  Because these
10210     # don't look like the other tables, they have their own handler.
10211     # An example:
10212     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10213     #
10214     # This just adds the sequence to an array for later handling
10215
10216     my $file = shift;
10217     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10218
10219     while ($file->next_line) {
10220         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10221         if (@remainder) {
10222             $file->carp_bad_line(
10223                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10224             next;
10225         }
10226
10227         # Note single \t in keeping with special output format of
10228         # Perl_charnames.  But it turns out that the code points don't have to
10229         # be 5 digits long, like the rest, based on the internal workings of
10230         # charnames.pm.  This could be easily changed for consistency.
10231         push @named_sequences, "$sequence\t$name";
10232     }
10233     return;
10234 }
10235
10236 { # Closure
10237
10238     my $first_range;
10239
10240     sub  filter_early_ea_lb {
10241         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
10242         # third field be the name of the code point, which can be ignored in
10243         # most cases.  But it can be meaningful if it marks a range:
10244         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10245         # 3400;W;<CJK Ideograph Extension A, First>
10246         #
10247         # We need to see the First in the example above to know it's a range.
10248         # They did not use the later range syntaxes.  This routine changes it
10249         # to use the modern syntax.
10250         # $1 is the Input_file object.
10251
10252         my @fields = split /\s*;\s*/;
10253         if ($fields[2] =~ /^<.*, First>/) {
10254             $first_range = $fields[0];
10255             $_ = "";
10256         }
10257         elsif ($fields[2] =~ /^<.*, Last>/) {
10258             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10259         }
10260         else {
10261             undef $first_range;
10262             $_ = "$fields[0]; $fields[1]";
10263         }
10264
10265         return;
10266     }
10267 }
10268
10269 sub filter_old_style_arabic_shaping {
10270     # Early versions used a different term for the later one.
10271
10272     my @fields = split /\s*;\s*/;
10273     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10274     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
10275     $_ = join ';', @fields;
10276     return;
10277 }
10278
10279 sub filter_arabic_shaping_line {
10280     # ArabicShaping.txt has entries that look like:
10281     # 062A; TEH; D; BEH
10282     # The field containing 'TEH' is not used.  The next field is Joining_Type
10283     # and the last is Joining_Group
10284     # This generates two lines to pass on, one for each property on the input
10285     # line.
10286
10287     my $file = shift;
10288     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10289
10290     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10291
10292     if (@fields > 4) {
10293         $file->carp_bad_line('Extra fields');
10294         $_ = "";
10295         return;
10296     }
10297
10298     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10299     $_ = "$fields[0]; Joining_Type; $fields[2]";
10300
10301     return;
10302 }
10303
10304 sub setup_special_casing {
10305     # SpecialCasing.txt contains the non-simple case change mappings.  The
10306     # simple ones are in UnicodeData.txt, which should already have been read
10307     # in to the full property data structures, so as to initialize these with
10308     # the simple ones.  Then the SpecialCasing.txt entries overwrite the ones
10309     # which have different full mappings.
10310
10311     # This routine sees if the simple mappings are to be output, and if so,
10312     # copies what has already been put into the full mapping tables, while
10313     # they still contain only the simple mappings.
10314
10315     # The reason it is done this way is that the simple mappings are probably
10316     # not going to be output, so it saves work to initialize the full tables
10317     # with the simple mappings, and then overwrite those relatively few
10318     # entries in them that have different full mappings, and thus skip the
10319     # simple mapping tables altogether.
10320
10321     my $file= shift;
10322     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10323
10324     # For each of the case change mappings...
10325     foreach my $case ('lc', 'tc', 'uc') {
10326         my $full = property_ref($case);
10327         unless (defined $full && ! $full->is_empty) {
10328             Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
10329         }
10330
10331         # The simple version's name in each mapping merely has an 's' in front
10332         # of the full one's
10333         my $simple = property_ref('s' . $case);
10334         $simple->initialize($full) if $simple->to_output_map();
10335     }
10336
10337     return;
10338 }
10339
10340 sub filter_special_casing_line {
10341     # Change the format of $_ from SpecialCasing.txt into something that the
10342     # generic handler understands.  Each input line contains three case
10343     # mappings.  This will generate three lines to pass to the generic handler
10344     # for each of those.
10345
10346     # The input syntax (after stripping comments and trailing white space is
10347     # like one of the following (with the final two being entries that we
10348     # ignore):
10349     # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10350     # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10351     # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10352     # Note the trailing semi-colon, unlike many of the input files.  That
10353     # means that there will be an extra null field generated by the split
10354
10355     my $file = shift;
10356     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10357
10358     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10359
10360     # field #4 is when this mapping is conditional.  If any of these get
10361     # implemented, it would be by hard-coding in the casing functions in the
10362     # Perl core, not through tables.  But if there is a new condition we don't
10363     # know about, output a warning.  We know about all the conditions through
10364     # 6.0
10365     if ($fields[4] ne "") {
10366         my @conditions = split ' ', $fields[4];
10367         if ($conditions[0] ne 'tr'  # We know that these languages have
10368                                     # conditions, and some are multiple
10369             && $conditions[0] ne 'az'
10370             && $conditions[0] ne 'lt'
10371
10372             # And, we know about a single condition Final_Sigma, but
10373             # nothing else.
10374             && ($v_version gt v5.2.0
10375                 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10376         {
10377             $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10378         }
10379         elsif ($conditions[0] ne 'Final_Sigma') {
10380
10381                 # Don't print out a message for Final_Sigma, because we have
10382                 # hard-coded handling for it.  (But the standard could change
10383                 # what the rule should be, but it wouldn't show up here
10384                 # anyway.
10385
10386                 print "# SKIPPING Special Casing: $_\n"
10387                                                     if $verbosity >= $VERBOSE;
10388         }
10389         $_ = "";
10390         return;
10391     }
10392     elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10393         $file->carp_bad_line('Extra fields');
10394         $_ = "";
10395         return;
10396     }
10397
10398     $_ = "$fields[0]; lc; $fields[1]";
10399     $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10400     $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
10401
10402     return;
10403 }
10404
10405 sub filter_old_style_case_folding {
10406     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
10407     # and later style.  Different letters were used in the earlier.
10408
10409     my $file = shift;
10410     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10411
10412     my @fields = split /\s*;\s*/;
10413     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10414         $fields[1] = 'I';
10415     }
10416     elsif ($fields[1] eq 'L') {
10417         $fields[1] = 'C';             # L => C always
10418     }
10419     elsif ($fields[1] eq 'E') {
10420         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
10421             $fields[1] = 'F'
10422         }
10423         else {
10424             $fields[1] = 'C'
10425         }
10426     }
10427     else {
10428         $file->carp_bad_line("Expecting L or E in second field");
10429         $_ = "";
10430         return;
10431     }
10432     $_ = join("; ", @fields) . ';';
10433     return;
10434 }
10435
10436 { # Closure for case folding
10437
10438     # Create the map for simple only if are going to output it, for otherwise
10439     # it takes no part in anything we do.
10440     my $to_output_simple;
10441
10442     # XXX
10443     # These are experimental, perhaps will need these to pass to regcomp.c to
10444     # handle the cases where for example the Kelvin sign character folds to k,
10445     # and in regcomp, we need to know which of the characters can have a
10446     # non-latin1 char fold to it, so it doesn't do the optimizations it might
10447     # otherwise.
10448     my @latin1_singly_folded;
10449     my @latin1_folded;
10450
10451     sub setup_case_folding($) {
10452         # Read in the case foldings in CaseFolding.txt.  This handles both
10453         # simple and full case folding.
10454
10455         $to_output_simple
10456                         = property_ref('Simple_Case_Folding')->to_output_map;
10457
10458         return;
10459     }
10460
10461     sub filter_case_folding_line {
10462         # Called for each line in CaseFolding.txt
10463         # Input lines look like:
10464         # 0041; C; 0061; # LATIN CAPITAL LETTER A
10465         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10466         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10467         #
10468         # 'C' means that folding is the same for both simple and full
10469         # 'F' that it is only for full folding
10470         # 'S' that it is only for simple folding
10471         # 'T' is locale-dependent, and ignored
10472         # 'I' is a type of 'F' used in some early releases.
10473         # Note the trailing semi-colon, unlike many of the input files.  That
10474         # means that there will be an extra null field generated by the split
10475         # below, which we ignore and hence is not an error.
10476
10477         my $file = shift;
10478         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10479
10480         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10481         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10482             $file->carp_bad_line('Extra fields');
10483             $_ = "";
10484             return;
10485         }
10486
10487         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
10488             $_ = "";
10489             return;
10490         }
10491
10492         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10493         # I are all full foldings
10494         if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
10495             $_ = "$range; Case_Folding; $map";
10496         }
10497         else {
10498             $_ = "";
10499             if ($type ne 'S') {
10500                $file->carp_bad_line('Expecting C F I S or T in second field');
10501                return;
10502             }
10503         }
10504
10505         # C and S are simple foldings, but simple case folding is not needed
10506         # unless we explicitly want its map table output.
10507         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10508             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10509         }
10510
10511         # XXX Experimental, see comment above
10512         if ($type ne 'S' && hex($range) >= 256) {   # assumes range is 1 point
10513             my @folded = split ' ', $map;
10514             if (hex $folded[0] < 256 && @folded == 1) {
10515                 push @latin1_singly_folded, hex $folded[0];
10516             }
10517             foreach my $folded (@folded) {
10518                 push @latin1_folded, hex $folded if hex $folded < 256;
10519             }
10520         }
10521
10522         return;
10523     }
10524
10525     sub post_fold {
10526         # XXX Experimental, see comment above
10527         return;
10528
10529         #local $to_trace = 1 if main::DEBUG;
10530         @latin1_singly_folded = uniques(@latin1_singly_folded);
10531         @latin1_folded = uniques(@latin1_folded);
10532         trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
10533         trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
10534         return;
10535     }
10536 } # End case fold closure
10537
10538 sub filter_jamo_line {
10539     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
10540     # from this file that is used in generating the Name property for Jamo
10541     # code points.  But, it also is used to convert early versions' syntax
10542     # into the modern form.  Here are two examples:
10543     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
10544     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
10545     #
10546     # The input is $_, the output is $_ filtered.
10547
10548     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
10549
10550     # Let the caller handle unexpected input.  In earlier versions, there was
10551     # a third field which is supposed to be a comment, but did not have a '#'
10552     # before it.
10553     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10554
10555     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
10556                                 # beginning.
10557
10558     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
10559     $fields[1] = 'R' if $fields[0] eq '1105';
10560
10561     # Add to structure so can generate Names from it.
10562     my $cp = hex $fields[0];
10563     my $short_name = $fields[1];
10564     $Jamo{$cp} = $short_name;
10565     if ($cp <= $LBase + $LCount) {
10566         $Jamo_L{$short_name} = $cp - $LBase;
10567     }
10568     elsif ($cp <= $VBase + $VCount) {
10569         $Jamo_V{$short_name} = $cp - $VBase;
10570     }
10571     elsif ($cp <= $TBase + $TCount) {
10572         $Jamo_T{$short_name} = $cp - $TBase;
10573     }
10574     else {
10575         Carp::my_carp_bug("Unexpected Jamo code point in $_");
10576     }
10577
10578
10579     # Reassemble using just the first two fields to look like a typical
10580     # property file line
10581     $_ = "$fields[0]; $fields[1]";
10582
10583     return;
10584 }
10585
10586 sub register_fraction($) {
10587     # This registers the input rational number so that it can be passed on to
10588     # utf8_heavy.pl, both in rational and floating forms.
10589
10590     my $rational = shift;
10591
10592     my $float = eval $rational;
10593     $nv_floating_to_rational{$float} = $rational;
10594     return;
10595 }
10596
10597 sub filter_numeric_value_line {
10598     # DNumValues contains lines of a different syntax than the typical
10599     # property file:
10600     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
10601     #
10602     # This routine transforms $_ containing the anomalous syntax to the
10603     # typical, by filtering out the extra columns, and convert early version
10604     # decimal numbers to strings that look like rational numbers.
10605
10606     my $file = shift;
10607     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10608
10609     # Starting in 5.1, there is a rational field.  Just use that, omitting the
10610     # extra columns.  Otherwise convert the decimal number in the second field
10611     # to a rational, and omit extraneous columns.
10612     my @fields = split /\s*;\s*/, $_, -1;
10613     my $rational;
10614
10615     if ($v_version ge v5.1.0) {
10616         if (@fields != 4) {
10617             $file->carp_bad_line('Not 4 semi-colon separated fields');
10618             $_ = "";
10619             return;
10620         }
10621         $rational = $fields[3];
10622         $_ = join '; ', @fields[ 0, 3 ];
10623     }
10624     else {
10625
10626         # Here, is an older Unicode file, which has decimal numbers instead of
10627         # rationals in it.  Use the fraction to calculate the denominator and
10628         # convert to rational.
10629
10630         if (@fields != 2 && @fields != 3) {
10631             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10632             $_ = "";
10633             return;
10634         }
10635
10636         my $codepoints = $fields[0];
10637         my $decimal = $fields[1];
10638         if ($decimal =~ s/\.0+$//) {
10639
10640             # Anything ending with a decimal followed by nothing but 0's is an
10641             # integer
10642             $_ = "$codepoints; $decimal";
10643             $rational = $decimal;
10644         }
10645         else {
10646
10647             my $denominator;
10648             if ($decimal =~ /\.50*$/) {
10649                 $denominator = 2;
10650             }
10651
10652             # Here have the hardcoded repeating decimals in the fraction, and
10653             # the denominator they imply.  There were only a few denominators
10654             # in the older Unicode versions of this file which this code
10655             # handles, so it is easy to convert them.
10656
10657             # The 4 is because of a round-off error in the Unicode 3.2 files
10658             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10659                 $denominator = 3;
10660             }
10661             elsif ($decimal =~ /\.[27]50*$/) {
10662                 $denominator = 4;
10663             }
10664             elsif ($decimal =~ /\.[2468]0*$/) {
10665                 $denominator = 5;
10666             }
10667             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10668                 $denominator = 6;
10669             }
10670             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10671                 $denominator = 8;
10672             }
10673             if ($denominator) {
10674                 my $sign = ($decimal < 0) ? "-" : "";
10675                 my $numerator = int((abs($decimal) * $denominator) + .5);
10676                 $rational = "$sign$numerator/$denominator";
10677                 $_ = "$codepoints; $rational";
10678             }
10679             else {
10680                 $file->carp_bad_line("Can't cope with number '$decimal'.");
10681                 $_ = "";
10682                 return;
10683             }
10684         }
10685     }
10686
10687     register_fraction($rational) if $rational =~ qr{/};
10688     return;
10689 }
10690
10691 { # Closure
10692     my %unihan_properties;
10693     my $iicore;
10694
10695
10696     sub setup_unihan {
10697         # Do any special setup for Unihan properties.
10698
10699         # This property gives the wrong computed type, so override.
10700         my $usource = property_ref('kIRG_USource');
10701         $usource->set_type($STRING) if defined $usource;
10702
10703         # This property is to be considered binary, so change all the values
10704         # to Y.
10705         $iicore = property_ref('kIICore');
10706         if (defined $iicore) {
10707             $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10708
10709             # We have to change the default map, because the @missing line is
10710             # misleading, given that we are treating it as binary.
10711             $iicore->set_default_map('N');
10712             $iicore->set_type($BINARY);
10713         }
10714
10715         return;
10716     }
10717
10718     sub filter_unihan_line {
10719         # Change unihan db lines to look like the others in the db.  Here is
10720         # an input sample:
10721         #   U+341C        kCangjie        IEKN
10722
10723         # Tabs are used instead of semi-colons to separate fields; therefore
10724         # they may have semi-colons embedded in them.  Change these to periods
10725         # so won't screw up the rest of the code.
10726         s/;/./g;
10727
10728         # Remove lines that don't look like ones we accept.
10729         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10730             $_ = "";
10731             return;
10732         }
10733
10734         # Extract the property, and save a reference to its object.
10735         my $property = $1;
10736         if (! exists $unihan_properties{$property}) {
10737             $unihan_properties{$property} = property_ref($property);
10738         }
10739
10740         # Don't do anything unless the property is one we're handling, which
10741         # we determine by seeing if there is an object defined for it or not
10742         if (! defined $unihan_properties{$property}) {
10743             $_ = "";
10744             return;
10745         }
10746
10747         # The iicore property is supposed to be a boolean, so convert to our
10748         # standard boolean form.
10749         if (defined $iicore && $unihan_properties{$property} == $iicore) {
10750             $_ =~ s/$property.*/$property\tY/
10751         }
10752
10753         # Convert the tab separators to our standard semi-colons, and convert
10754         # the U+HHHH notation to the rest of the standard's HHHH
10755         s/\t/;/g;
10756         s/\b U \+ (?= $code_point_re )//xg;
10757
10758         #local $to_trace = 1 if main::DEBUG;
10759         trace $_ if main::DEBUG && $to_trace;
10760
10761         return;
10762     }
10763 }
10764
10765 sub filter_blocks_lines {
10766     # In the Blocks.txt file, the names of the blocks don't quite match the
10767     # names given in PropertyValueAliases.txt, so this changes them so they
10768     # do match:  Blanks and hyphens are changed into underscores.  Also makes
10769     # early release versions look like later ones
10770     #
10771     # $_ is transformed to the correct value.
10772
10773     my $file = shift;
10774         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10775
10776     if ($v_version lt v3.2.0) {
10777         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10778             $_ = "";
10779             return;
10780         }
10781
10782         # Old versions used a different syntax to mark the range.
10783         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10784     }
10785
10786     my @fields = split /\s*;\s*/, $_, -1;
10787     if (@fields != 2) {
10788         $file->carp_bad_line("Expecting exactly two fields");
10789         $_ = "";
10790         return;
10791     }
10792
10793     # Change hyphens and blanks in the block name field only
10794     $fields[1] =~ s/[ -]/_/g;
10795     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
10796
10797     $_ = join("; ", @fields);
10798     return;
10799 }
10800
10801 { # Closure
10802     my $current_property;
10803
10804     sub filter_old_style_proplist {
10805         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
10806         # was in a completely different syntax.  Ken Whistler of Unicode says
10807         # that it was something he used as an aid for his own purposes, but
10808         # was never an official part of the standard.  However, comments in
10809         # DAge.txt indicate that non-character code points were available in
10810         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
10811         # there except through this file (but on the other hand, they first
10812         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10813         # not.  But the claim is that it was published as an aid to others who
10814         # might want some more information than was given in the official UCD
10815         # of the time.  Many of the properties in it were incorporated into
10816         # the later PropList.txt, but some were not.  This program uses this
10817         # early file to generate property tables that are otherwise not
10818         # accessible in the early UCD's, and most were probably not really
10819         # official at that time, so one could argue that it should be ignored,
10820         # and you can easily modify things to skip this.  And there are bugs
10821         # in this file in various versions.  (For example, the 2.1.9 version
10822         # removes from Alphabetic the CJK range starting at 4E00, and they
10823         # weren't added back in until 3.1.0.)  Many of this file's properties
10824         # were later sanctioned, so this code generates tables for those
10825         # properties that aren't otherwise in the UCD of the time but
10826         # eventually did become official, and throws away the rest.  Here is a
10827         # list of all the ones that are thrown away:
10828         #   Bidi=*                       duplicates UnicodeData.txt
10829         #   Combining                    never made into official property;
10830         #                                is \P{ccc=0}
10831         #   Composite                    never made into official property.
10832         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
10833         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
10834         #   Delimiter                    never made into official property;
10835         #                                removed in 3.0.1
10836         #   Format Control               never made into official property;
10837         #                                similar to gc=cf
10838         #   High Surrogate               duplicates Blocks.txt
10839         #   Ignorable Control            never made into official property;
10840         #                                similar to di=y
10841         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
10842         #   Left of Pair                 never made into official property;
10843         #   Line Separator               duplicates UnicodeData.txt: gc=zl
10844         #   Low Surrogate                duplicates Blocks.txt
10845         #   Non-break                    was actually listed as a property
10846         #                                in 3.2, but without any code
10847         #                                points.  Unicode denies that this
10848         #                                was ever an official property
10849         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
10850         #   Numeric                      duplicates UnicodeData.txt: gc=cc
10851         #   Paired Punctuation           never made into official property;
10852         #                                appears to be gc=ps + gc=pe
10853         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
10854         #   Private Use                  duplicates UnicodeData.txt: gc=co
10855         #   Private Use High Surrogate   duplicates Blocks.txt
10856         #   Punctuation                  duplicates UnicodeData.txt: gc=p
10857         #   Space                        different definition than eventual
10858         #                                one.
10859         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
10860         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
10861         #   Zero-width                   never made into official property;
10862         #                                subset of gc=cf
10863         # Most of the properties have the same names in this file as in later
10864         # versions, but a couple do not.
10865         #
10866         # This subroutine filters $_, converting it from the old style into
10867         # the new style.  Here's a sample of the old-style
10868         #
10869         #   *******************************************
10870         #
10871         #   Property dump for: 0x100000A0 (Join Control)
10872         #
10873         #   200C..200D  (2 chars)
10874         #
10875         # In the example, the property is "Join Control".  It is kept in this
10876         # closure between calls to the subroutine.  The numbers beginning with
10877         # 0x were internal to Ken's program that generated this file.
10878
10879         # If this line contains the property name, extract it.
10880         if (/^Property dump for: [^(]*\((.*)\)/) {
10881             $_ = $1;
10882
10883             # Convert white space to underscores.
10884             s/ /_/g;
10885
10886             # Convert the few properties that don't have the same name as
10887             # their modern counterparts
10888             s/Identifier_Part/ID_Continue/
10889             or s/Not_a_Character/NChar/;
10890
10891             # If the name matches an existing property, use it.
10892             if (defined property_ref($_)) {
10893                 trace "new property=", $_ if main::DEBUG && $to_trace;
10894                 $current_property = $_;
10895             }
10896             else {        # Otherwise discard it
10897                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10898                 undef $current_property;
10899             }
10900             $_ = "";    # The property is saved for the next lines of the
10901                         # file, but this defining line is of no further use,
10902                         # so clear it so that the caller won't process it
10903                         # further.
10904         }
10905         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10906
10907             # Here, the input line isn't a header defining a property for the
10908             # following section, and either we aren't in such a section, or
10909             # the line doesn't look like one that defines the code points in
10910             # such a section.  Ignore this line.
10911             $_ = "";
10912         }
10913         else {
10914
10915             # Here, we have a line defining the code points for the current
10916             # stashed property.  Anything starting with the first blank is
10917             # extraneous.  Otherwise, it should look like a normal range to
10918             # the caller.  Append the property name so that it looks just like
10919             # a modern PropList entry.
10920
10921             $_ =~ s/\s.*//;
10922             $_ .= "; $current_property";
10923         }
10924         trace $_ if main::DEBUG && $to_trace;
10925         return;
10926     }
10927 } # End closure for old style proplist
10928
10929 sub filter_old_style_normalization_lines {
10930     # For early releases of Unicode, the lines were like:
10931     #        74..2A76    ; NFKD_NO
10932     # For later releases this became:
10933     #        74..2A76    ; NFKD_QC; N
10934     # Filter $_ to look like those in later releases.
10935     # Similarly for MAYBEs
10936
10937     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10938
10939     # Also, the property FC_NFKC was abbreviated to FNC
10940     s/FNC/FC_NFKC/;
10941     return;
10942 }
10943
10944 sub finish_Unicode() {
10945     # This routine should be called after all the Unicode files have been read
10946     # in.  It:
10947     # 1) Adds the mappings for code points missing from the files which have
10948     #    defaults specified for them.
10949     # 2) At this this point all mappings are known, so it computes the type of
10950     #    each property whose type hasn't been determined yet.
10951     # 3) Calculates all the regular expression match tables based on the
10952     #    mappings.
10953     # 3) Calculates and adds the tables which are defined by Unicode, but
10954     #    which aren't derived by them
10955
10956     # For each property, fill in any missing mappings, and calculate the re
10957     # match tables.  If a property has more than one missing mapping, the
10958     # default is a reference to a data structure, and requires data from other
10959     # properties to resolve.  The sort is used to cause these to be processed
10960     # last, after all the other properties have been calculated.
10961     # (Fortunately, the missing properties so far don't depend on each other.)
10962     foreach my $property
10963         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10964         property_ref('*'))
10965     {
10966         # $perl has been defined, but isn't one of the Unicode properties that
10967         # need to be finished up.
10968         next if $property == $perl;
10969
10970         # Handle the properties that have more than one possible default
10971         if (ref $property->default_map) {
10972             my $default_map = $property->default_map;
10973
10974             # These properties have stored in the default_map:
10975             # One or more of:
10976             #   1)  A default map which applies to all code points in a
10977             #       certain class
10978             #   2)  an expression which will evaluate to the list of code
10979             #       points in that class
10980             # And
10981             #   3) the default map which applies to every other missing code
10982             #      point.
10983             #
10984             # Go through each list.
10985             while (my ($default, $eval) = $default_map->get_next_defaults) {
10986
10987                 # Get the class list, and intersect it with all the so-far
10988                 # unspecified code points yielding all the code points
10989                 # in the class that haven't been specified.
10990                 my $list = eval $eval;
10991                 if ($@) {
10992                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10993                     last;
10994                 }
10995
10996                 # Narrow down the list to just those code points we don't have
10997                 # maps for yet.
10998                 $list = $list & $property->inverse_list;
10999
11000                 # Add mappings to the property for each code point in the list
11001                 foreach my $range ($list->ranges) {
11002                     $property->add_map($range->start, $range->end, $default,
11003                     Replace => $CROAK);
11004                 }
11005             }
11006
11007             # All remaining code points have the other mapping.  Set that up
11008             # so the normal single-default mapping code will work on them
11009             $property->set_default_map($default_map->other_default);
11010
11011             # And fall through to do that
11012         }
11013
11014         # We should have enough data now to compute the type of the property.
11015         $property->compute_type;
11016         my $property_type = $property->type;
11017
11018         next if ! $property->to_create_match_tables;
11019
11020         # Here want to create match tables for this property
11021
11022         # The Unicode db always (so far, and they claim into the future) have
11023         # the default for missing entries in binary properties be 'N' (unless
11024         # there is a '@missing' line that specifies otherwise)
11025         if ($property_type == $BINARY && ! defined $property->default_map) {
11026             $property->set_default_map('N');
11027         }
11028
11029         # Add any remaining code points to the mapping, using the default for
11030         # missing code points.
11031         if (defined (my $default_map = $property->default_map)) {
11032
11033             # This fills in any missing values with the default.
11034             $property->add_map(0, $LAST_UNICODE_CODEPOINT,
11035                                $default_map, Replace => $NO);
11036
11037             # Make sure there is a match table for the default
11038             if (! defined $property->table($default_map)) {
11039                 $property->add_match_table($default_map);
11040             }
11041         }
11042
11043         # Have all we need to populate the match tables.
11044         my $property_name = $property->name;
11045         foreach my $range ($property->ranges) {
11046             my $map = $range->value;
11047             my $table = property_ref($property_name)->table($map);
11048             if (! defined $table) {
11049
11050                 # Integral and rational property values are not necessarily
11051                 # defined in PropValueAliases, but all other ones should be,
11052                 # starting in 5.1
11053                 if ($v_version ge v5.1.0
11054                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11055                 {
11056                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
11057                 }
11058                 $table = property_ref($property_name)->add_match_table($map);
11059             }
11060
11061             $table->add_range($range->start, $range->end);
11062         }
11063
11064         # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
11065         # all properties have this optional prefix.  These do not get a
11066         # separate entry in the pod file, because are covered by a wild-card
11067         # entry
11068         foreach my $alias ($property->aliases) {
11069             my $Is_name = 'Is_' . $alias->name;
11070             if (! defined (my $pre_existing = property_ref($Is_name))) {
11071                 $property->add_alias($Is_name,
11072                                      Pod_Entry => 0,
11073                                      Status => $alias->status,
11074                                      Externally_Ok => 0);
11075             }
11076             else {
11077
11078                 # It seemed too much work to add in these warnings when it
11079                 # appears that Unicode has made a decision never to begin a
11080                 # property name with 'Is_', so this shouldn't happen, but just
11081                 # in case, it is a warning.
11082                 Carp::my_carp(<<END
11083 There is already an alias named $Is_name (from " . $pre_existing . "), so not
11084 creating this alias for $property.  The generated table and pod files do not
11085 warn users of this conflict.
11086 END
11087                 );
11088                 $has_Is_conflicts++;
11089             }
11090         } # End of loop through aliases for this property
11091     } # End of loop through all Unicode properties.
11092
11093     # Fill in the mappings that Unicode doesn't completely furnish.  First the
11094     # single letter major general categories.  If Unicode were to start
11095     # delivering the values, this would be redundant, but better that than to
11096     # try to figure out if should skip and not get it right.  Ths could happen
11097     # if a new major category were to be introduced, and the hard-coded test
11098     # wouldn't know about it.
11099     # This routine depends on the standard names for the general categories
11100     # being what it thinks they are, like 'Cn'.  The major categories are the
11101     # union of all the general category tables which have the same first
11102     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11103     foreach my $minor_table ($gc->tables) {
11104         my $minor_name = $minor_table->name;
11105         next if length $minor_name == 1;
11106         if (length $minor_name != 2) {
11107             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
11108             next;
11109         }
11110
11111         my $major_name = uc(substr($minor_name, 0, 1));
11112         my $major_table = $gc->table($major_name);
11113         $major_table += $minor_table;
11114     }
11115
11116     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
11117     # defines it as LC)
11118     my $LC = $gc->table('LC');
11119     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
11120     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
11121
11122
11123     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11124                          # deliver the correct values in it
11125         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11126
11127         # Lt not in release 1.
11128         $LC += $gc->table('Lt') if defined $gc->table('Lt');
11129     }
11130     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11131
11132     my $Cs = $gc->table('Cs');
11133     if (defined $Cs) {
11134         $Cs->add_note('Mostly not usable in Perl.');
11135         $Cs->add_comment(join_lines(<<END
11136 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
11137 Unicode text, and hence their use will generate (usually fatal) messages
11138 END
11139         ));
11140     }
11141
11142
11143     # Folding information was introduced later into Unicode data.  To get
11144     # Perl's case ignore (/i) to work at all in releases that don't have
11145     # folding, use the best available alternative, which is lower casing.
11146     my $fold = property_ref('Simple_Case_Folding');
11147     if ($fold->is_empty) {
11148         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11149         $fold->add_note(join_lines(<<END
11150 WARNING: This table uses lower case as a substitute for missing fold
11151 information
11152 END
11153         ));
11154     }
11155
11156     # Multiple-character mapping was introduced later into Unicode data.  If
11157     # missing, use the single-characters maps as best available alternative
11158     foreach my $map (qw {   Uppercase_Mapping
11159                             Lowercase_Mapping
11160                             Titlecase_Mapping
11161                             Case_Folding
11162                         } ) {
11163         my $full = property_ref($map);
11164         if ($full->is_empty) {
11165             my $simple = property_ref('Simple_' . $map);
11166             $full->initialize($simple);
11167             $full->add_comment($simple->comment) if ($simple->comment);
11168             $full->add_note(join_lines(<<END
11169 WARNING: This table uses simple mapping (single-character only) as a
11170 substitute for missing multiple-character information
11171 END
11172             ));
11173         }
11174     }
11175     return
11176 }
11177
11178 sub compile_perl() {
11179     # Create perl-defined tables.  Almost all are part of the pseudo-property
11180     # named 'perl' internally to this program.  Many of these are recommended
11181     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11182     # on those found there.
11183     # Almost all of these are equivalent to some Unicode property.
11184     # A number of these properties have equivalents restricted to the ASCII
11185     # range, with their names prefaced by 'Posix', to signify that these match
11186     # what the Posix standard says they should match.  A couple are
11187     # effectively this, but the name doesn't have 'Posix' in it because there
11188     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
11189     # to the full Unicode range, by our guesses as to what is appropriate.
11190
11191     # 'Any' is all code points.  As an error check, instead of just setting it
11192     # to be that, construct it to be the union of all the major categories
11193     my $Any = $perl->add_match_table('Any',
11194             Description  => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
11195             Matches_All => 1);
11196
11197     foreach my $major_table ($gc->tables) {
11198
11199         # Major categories are the ones with single letter names.
11200         next if length($major_table->name) != 1;
11201
11202         $Any += $major_table;
11203     }
11204
11205     if ($Any->max != $LAST_UNICODE_CODEPOINT) {
11206         Carp::my_carp_bug("Generated highest code point ("
11207            . sprintf("%X", $Any->max)
11208            . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
11209     }
11210     if ($Any->range_count != 1 || $Any->min != 0) {
11211      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11212     }
11213
11214     $Any->add_alias('All');
11215
11216     # Assigned is the opposite of gc=unassigned
11217     my $Assigned = $perl->add_match_table('Assigned',
11218                                 Description  => "All assigned code points",
11219                                 Initialize => ~ $gc->table('Unassigned'),
11220                                 );
11221
11222     # Our internal-only property should be treated as more than just a
11223     # synonym.
11224     $perl->add_match_table('_CombAbove')
11225             ->set_equivalent_to(property_ref('ccc')->table('Above'),
11226                                                                 Related => 1);
11227
11228     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11229     if (defined $block) {   # This is equivalent to the block if have it.
11230         my $Unicode_ASCII = $block->table('Basic_Latin');
11231         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11232             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11233         }
11234     }
11235
11236     # Very early releases didn't have blocks, so initialize ASCII ourselves if
11237     # necessary
11238     if ($ASCII->is_empty) {
11239         $ASCII->initialize([ 0..127 ]);
11240     }
11241
11242     # Get the best available case definitions.  Early Unicode versions didn't
11243     # have Uppercase and Lowercase defined, so use the general category
11244     # instead for them.
11245     my $Lower = $perl->add_match_table('Lower');
11246     my $Unicode_Lower = property_ref('Lowercase');
11247     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11248         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
11249     }
11250     else {
11251         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11252                                                                 Related => 1);
11253     }
11254     $Lower->add_alias('XPosixLower');
11255     $perl->add_match_table("PosixLower",
11256                             Description => "[a-z]",
11257                             Initialize => $Lower & $ASCII,
11258                             );
11259
11260     my $Upper = $perl->add_match_table('Upper');
11261     my $Unicode_Upper = property_ref('Uppercase');
11262     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11263         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
11264     }
11265     else {
11266         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11267                                                                 Related => 1);
11268     }
11269     $Upper->add_alias('XPosixUpper');
11270     $perl->add_match_table("PosixUpper",
11271                             Description => "[A-Z]",
11272                             Initialize => $Upper & $ASCII,
11273                             );
11274
11275     # Earliest releases didn't have title case.  Initialize it to empty if not
11276     # otherwise present
11277     my $Title = $perl->add_match_table('Title');
11278     my $lt = $gc->table('Lt');
11279     if (defined $lt) {
11280         $Title->set_equivalent_to($lt, Related => 1);
11281     }
11282
11283     # If this Unicode version doesn't have Cased, set up our own.  From
11284     # Unicode 5.1: Definition D120: A character C is defined to be cased if
11285     # and only if C has the Lowercase or Uppercase property or has a
11286     # General_Category value of Titlecase_Letter.
11287     unless (defined property_ref('Cased')) {
11288         my $cased = $perl->add_match_table('Cased',
11289                         Initialize => $Lower + $Upper + $Title,
11290                         Description => 'Uppercase or Lowercase or Titlecase',
11291                         );
11292     }
11293
11294     # Similarly, set up our own Case_Ignorable property if this Unicode
11295     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
11296     # C is defined to be case-ignorable if C has the value MidLetter or the
11297     # value MidNumLet for the Word_Break property or its General_Category is
11298     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11299     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11300
11301     # Perl has long had an internal-only alias for this property.
11302     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
11303     my $case_ignorable = property_ref('Case_Ignorable');
11304     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11305         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11306                                                                 Related => 1);
11307     }
11308     else {
11309
11310         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11311
11312         # The following three properties are not in early releases
11313         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11314         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11315         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11316
11317         # For versions 4.1 - 5.0, there is no MidNumLet property, and
11318         # correspondingly the case-ignorable definition lacks that one.  For
11319         # 4.0, it appears that it was meant to be the same definition, but was
11320         # inadvertently omitted from the standard's text, so add it if the
11321         # property actually is there
11322         my $wb = property_ref('Word_Break');
11323         if (defined $wb) {
11324             my $midlet = $wb->table('MidLetter');
11325             $perl_case_ignorable += $midlet if defined $midlet;
11326             my $midnumlet = $wb->table('MidNumLet');
11327             $perl_case_ignorable += $midnumlet if defined $midnumlet;
11328         }
11329         else {
11330
11331             # In earlier versions of the standard, instead of the above two
11332             # properties , just the following characters were used:
11333             $perl_case_ignorable +=  0x0027  # APOSTROPHE
11334                                 +   0x00AD  # SOFT HYPHEN (SHY)
11335                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
11336         }
11337     }
11338
11339     # The remaining perl defined tables are mostly based on Unicode TR 18,
11340     # "Annex C: Compatibility Properties".  All of these have two versions,
11341     # one whose name generally begins with Posix that is posix-compliant, and
11342     # one that matches Unicode characters beyond the Posix, ASCII range
11343
11344     my $Alpha = $perl->add_match_table('Alpha');
11345
11346     # Alphabetic was not present in early releases
11347     my $Alphabetic = property_ref('Alphabetic');
11348     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11349         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11350     }
11351     else {
11352
11353         # For early releases, we don't get it exactly right.  The below
11354         # includes more than it should, which in 5.2 terms is: L + Nl +
11355         # Other_Alphabetic.  Other_Alphabetic contains many characters from
11356         # Mn and Mc.  It's better to match more than we should, than less than
11357         # we should.
11358         $Alpha->initialize($gc->table('Letter')
11359                             + $gc->table('Mn')
11360                             + $gc->table('Mc'));
11361         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
11362         $Alpha->add_description('Alphabetic');
11363     }
11364     $Alpha->add_alias('XPosixAlpha');
11365     $perl->add_match_table("PosixAlpha",
11366                             Description => "[A-Za-z]",
11367                             Initialize => $Alpha & $ASCII,
11368                             );
11369
11370     my $Alnum = $perl->add_match_table('Alnum',
11371                         Description => 'Alphabetic and (Decimal) Numeric',
11372                         Initialize => $Alpha + $gc->table('Decimal_Number'),
11373                         );
11374     $Alnum->add_alias('XPosixAlnum');
11375     $perl->add_match_table("PosixAlnum",
11376                             Description => "[A-Za-z0-9]",
11377                             Initialize => $Alnum & $ASCII,
11378                             );
11379
11380     my $Word = $perl->add_match_table('Word',
11381                                 Description => '\w, including beyond ASCII;'
11382                                             . ' = \p{Alnum} + \pM + \p{Pc}',
11383                                 Initialize => $Alnum + $gc->table('Mark'),
11384                                 );
11385     $Word->add_alias('XPosixWord');
11386     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11387     $Word += $Pc if defined $Pc;
11388
11389     # This is a Perl extension, so the name doesn't begin with Posix.
11390     my $PerlWord = $perl->add_match_table('PerlWord',
11391                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11392                     Initialize => $Word & $ASCII,
11393                     );
11394     $PerlWord->add_alias('PosixWord');
11395
11396     my $Blank = $perl->add_match_table('Blank',
11397                                 Description => '\h, Horizontal white space',
11398
11399                                 # 200B is Zero Width Space which is for line
11400                                 # break control, and was listed as
11401                                 # Space_Separator in early releases
11402                                 Initialize => $gc->table('Space_Separator')
11403                                             +   0x0009  # TAB
11404                                             -   0x200B, # ZWSP
11405                                 );
11406     $Blank->add_alias('HorizSpace');        # Another name for it.
11407     $Blank->add_alias('XPosixBlank');
11408     $perl->add_match_table("PosixBlank",
11409                             Description => "\\t and ' '",
11410                             Initialize => $Blank & $ASCII,
11411                             );
11412
11413     my $VertSpace = $perl->add_match_table('VertSpace',
11414                             Description => '\v',
11415                             Initialize => $gc->table('Line_Separator')
11416                                         + $gc->table('Paragraph_Separator')
11417                                         + 0x000A  # LINE FEED
11418                                         + 0x000B  # VERTICAL TAB
11419                                         + 0x000C  # FORM FEED
11420                                         + 0x000D  # CARRIAGE RETURN
11421                                         + 0x0085, # NEL
11422                             );
11423     # No Posix equivalent for vertical space
11424
11425     my $Space = $perl->add_match_table('Space',
11426                 Description => '\s including beyond ASCII plus vertical tab',
11427                 Initialize => $Blank + $VertSpace,
11428     );
11429     $Space->add_alias('XPosixSpace');
11430     $perl->add_match_table("PosixSpace",
11431                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
11432                             Initialize => $Space & $ASCII,
11433                             );
11434
11435     # Perl's traditional space doesn't include Vertical Tab
11436     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
11437                                   Description => '\s, including beyond ASCII',
11438                                   Initialize => $Space - 0x000B,
11439                                 );
11440     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
11441     my $PerlSpace = $perl->add_match_table('PerlSpace',
11442                             Description => '\s, restricted to ASCII',
11443                             Initialize => $XPerlSpace & $ASCII,
11444                             );
11445
11446
11447     my $Cntrl = $perl->add_match_table('Cntrl',
11448                                         Description => 'Control characters');
11449     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
11450     $Cntrl->add_alias('XPosixCntrl');
11451     $perl->add_match_table("PosixCntrl",
11452                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
11453                             Initialize => $Cntrl & $ASCII,
11454                             );
11455
11456     # $controls is a temporary used to construct Graph.
11457     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11458                                                 + $gc->table('Control'));
11459     # Cs not in release 1
11460     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11461
11462     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
11463     my $Graph = $perl->add_match_table('Graph',
11464                         Description => 'Characters that are graphical',
11465                         Initialize => ~ ($Space + $controls),
11466                         );
11467     $Graph->add_alias('XPosixGraph');
11468     $perl->add_match_table("PosixGraph",
11469                             Description =>
11470                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
11471                             Initialize => $Graph & $ASCII,
11472                             );
11473
11474     $print = $perl->add_match_table('Print',
11475                         Description => 'Characters that are graphical plus space characters (but no controls)',
11476                         Initialize => $Blank + $Graph - $gc->table('Control'),
11477                         );
11478     $print->add_alias('XPosixPrint');
11479     $perl->add_match_table("PosixPrint",
11480                             Description =>
11481                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
11482                             Initialize => $print & $ASCII,
11483                             );
11484
11485     my $Punct = $perl->add_match_table('Punct');
11486     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11487
11488     # \p{punct} doesn't include the symbols, which posix does
11489     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11490                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
11491                     Initialize => $gc->table('Punctuation')
11492                                 + ($ASCII & $gc->table('Symbol')),
11493         );
11494     $perl->add_match_table('PosixPunct',
11495         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
11496         Initialize => $ASCII & $XPosixPunct,
11497         );
11498
11499     my $Digit = $perl->add_match_table('Digit',
11500                             Description => '[0-9] + all other decimal digits');
11501     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
11502     $Digit->add_alias('XPosixDigit');
11503     my $PosixDigit = $perl->add_match_table("PosixDigit",
11504                                             Description => '[0-9]',
11505                                             Initialize => $Digit & $ASCII,
11506                                             );
11507
11508     # Hex_Digit was not present in first release
11509     my $Xdigit = $perl->add_match_table('XDigit');
11510     $Xdigit->add_alias('XPosixXDigit');
11511     my $Hex = property_ref('Hex_Digit');
11512     if (defined $Hex && ! $Hex->is_empty) {
11513         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
11514     }
11515     else {
11516         # (Have to use hex instead of e.g. '0', because could be running on an
11517         # non-ASCII machine, and we want the Unicode (ASCII) values)
11518         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11519                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11520         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
11521     }
11522     $perl->add_match_table('PosixXDigit',
11523                             Initialize => $ASCII & $Xdigit,
11524                             Description => '[0-9A-Fa-f]',
11525                         );
11526
11527     my $dt = property_ref('Decomposition_Type');
11528     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11529         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11530         Perl_Extension => 1,
11531         Note => 'Union of all non-canonical decompositions',
11532         );
11533
11534     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11535     # than SD appeared, construct it ourselves, based on the first release SD
11536     # was in.
11537     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11538     my $soft_dotted = property_ref('Soft_Dotted');
11539     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11540         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11541     }
11542     else {
11543
11544         # This list came from 3.2 Soft_Dotted.
11545         $CanonDCIJ->initialize([ 0x0069,
11546                                  0x006A,
11547                                  0x012F,
11548                                  0x0268,
11549                                  0x0456,
11550                                  0x0458,
11551                                  0x1E2D,
11552                                  0x1ECB,
11553                                ]);
11554         $CanonDCIJ = $CanonDCIJ & $Assigned;
11555     }
11556
11557     # These are used in Unicode's definition of \X
11558     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11559     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11560
11561     my $gcb = property_ref('Grapheme_Cluster_Break');
11562
11563     # The 'extended' grapheme cluster came in 5.1.  The non-extended
11564     # definition differs too much from the traditional Perl one to use.
11565     if (defined $gcb && defined $gcb->table('SpacingMark')) {
11566
11567         # Note that assumes HST is defined; it came in an earlier release than
11568         # GCB.  In the line below, two negatives means: yes hangul
11569         $begin += ~ property_ref('Hangul_Syllable_Type')
11570                                                     ->table('Not_Applicable')
11571                + ~ ($gcb->table('Control')
11572                     + $gcb->table('CR')
11573                     + $gcb->table('LF'));
11574         $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11575
11576         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11577         $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
11578     }
11579     else {    # Old definition, used on early releases.
11580         $extend += $gc->table('Mark')
11581                 + 0x200C    # ZWNJ
11582                 + 0x200D;   # ZWJ
11583         $begin += ~ $extend;
11584
11585         # Here we may have a release that has the regular grapheme cluster
11586         # defined, or a release that doesn't have anything defined.
11587         # We set things up so the Perl core degrades gracefully, possibly with
11588         # placeholders that match nothing.
11589
11590         if (! defined $gcb) {
11591             $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11592         }
11593         my $hst = property_ref('HST');
11594         if (!defined $hst) {
11595             $hst = Property->new('HST', Status => $PLACEHOLDER);
11596             $hst->add_match_table('Not_Applicable',
11597                                 Initialize => $Any,
11598                                 Matches_All => 1);
11599         }
11600
11601         # On some releases, here we may not have the needed tables for the
11602         # perl core, in some releases we may.
11603         foreach my $name (qw{ L LV LVT T V prepend }) {
11604             my $table = $gcb->table($name);
11605             if (! defined $table) {
11606                 $table = $gcb->add_match_table($name);
11607                 push @tables_that_may_be_empty, $table->complete_name;
11608             }
11609
11610             # The HST property predates the GCB one, and has identical tables
11611             # for some of them, so use it if we can.
11612             if ($table->is_empty
11613                 && defined $hst
11614                 && defined $hst->table($name))
11615             {
11616                 $table += $hst->table($name);
11617             }
11618         }
11619     }
11620
11621     # More GCB.  If we found some hangul syllables, populate a combined
11622     # table.
11623     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11624     my $LV = $gcb->table('LV');
11625     if ($LV->is_empty) {
11626         push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11627     } else {
11628         $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11629         $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
11630     }
11631
11632     # Was previously constructed to contain both Name and Unicode_1_Name
11633     my @composition = ('Name', 'Unicode_1_Name');
11634
11635     if (@named_sequences) {
11636         push @composition, 'Named_Sequence';
11637         foreach my $sequence (@named_sequences) {
11638             $perl_charname->add_anomalous_entry($sequence);
11639         }
11640     }
11641
11642     my $alias_sentence = "";
11643     my $alias = property_ref('Name_Alias');
11644     if (defined $alias) {
11645         push @composition, 'Name_Alias';
11646         $alias->reset_each_range;
11647         while (my ($range) = $alias->each_range) {
11648             next if $range->value eq "";
11649             if ($range->start != $range->end) {
11650                 Carp::my_carp("Expecting only one code point in the range $range.  Just to keep going, using just the first code point;");
11651             }
11652             $perl_charname->add_duplicate($range->start, $range->value);
11653         }
11654         $alias_sentence = <<END;
11655 The Name_Alias property adds duplicate code point entries with a corrected
11656 name.  The original (less correct, but still valid) name will be physically
11657 last.
11658 END
11659     }
11660     my $comment;
11661     if (@composition <= 2) { # Always at least 2
11662         $comment = join " and ", @composition;
11663     }
11664     else {
11665         $comment = join ", ", @composition[0 .. scalar @composition - 2];
11666         $comment .= ", and $composition[-1]";
11667     }
11668
11669     $perl_charname->add_comment(join_lines( <<END
11670 This file is for charnames.pm.  It is the union of the $comment properties.
11671 Unicode_1_Name entries are used only for otherwise nameless code
11672 points.
11673 $alias_sentence
11674 END
11675     ));
11676
11677     # The combining class property used by Perl's normalize.pm is not located
11678     # in the normal mapping directory; create a copy for it.
11679     my $ccc = property_ref('Canonical_Combining_Class');
11680     my $perl_ccc = Property->new('Perl_ccc',
11681                             Default_Map => $ccc->default_map,
11682                             Full_Name => 'Perl_Canonical_Combining_Class',
11683                             Internal_Only_Warning => 1,
11684                             Perl_Extension => 1,
11685                             Pod_Entry =>0,
11686                             Type => $ENUM,
11687                             Initialize => $ccc,
11688                             File => 'CombiningClass',
11689                             Directory => File::Spec->curdir(),
11690                             );
11691     $perl_ccc->set_to_output_map(1);
11692     $perl_ccc->add_comment(join_lines(<<END
11693 This mapping is for normalize.pm.  It is currently identical to the Unicode
11694 Canonical_Combining_Class property.
11695 END
11696     ));
11697
11698     # This one match table for it is needed for calculations on output
11699     my $default = $perl_ccc->add_match_table($ccc->default_map,
11700                         Initialize => $ccc->table($ccc->default_map),
11701                         Status => $SUPPRESSED);
11702
11703     # Construct the Present_In property from the Age property.
11704     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11705         my $default_map = $age->default_map;
11706         my $in = Property->new('In',
11707                                 Default_Map => $default_map,
11708                                 Full_Name => "Present_In",
11709                                 Internal_Only_Warning => 1,
11710                                 Perl_Extension => 1,
11711                                 Type => $ENUM,
11712                                 Initialize => $age,
11713                                 );
11714         $in->add_comment(join_lines(<<END
11715 This file should not be used for any purpose.  The values in this file are the
11716 same as for $age, and not for what $in really means.  This is because anything
11717 defined in a given release should have multiple values: that release and all
11718 higher ones.  But only one value per code point can be represented in a table
11719 like this.
11720 END
11721         ));
11722
11723         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
11724         # lowest numbered (earliest) come first, with the non-numeric one
11725         # last.
11726         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11727                                             ? 1
11728                                             : ($b->name !~ /^[\d.]*$/)
11729                                                 ? -1
11730                                                 : $a->name <=> $b->name
11731                                             } $age->tables;
11732
11733         # The Present_In property is the cumulative age properties.  The first
11734         # one hence is identical to the first age one.
11735         my $previous_in = $in->add_match_table($first_age->name);
11736         $previous_in->set_equivalent_to($first_age, Related => 1);
11737
11738         my $description_start = "Code point's usage introduced in version ";
11739         $first_age->add_description($description_start . $first_age->name);
11740
11741         # To construct the accumulated values, for each of the age tables
11742         # starting with the 2nd earliest, merge the earliest with it, to get
11743         # all those code points existing in the 2nd earliest.  Repeat merging
11744         # the new 2nd earliest with the 3rd earliest to get all those existing
11745         # in the 3rd earliest, and so on.
11746         foreach my $current_age (@rest_ages) {
11747             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
11748
11749             my $current_in = $in->add_match_table(
11750                                     $current_age->name,
11751                                     Initialize => $current_age + $previous_in,
11752                                     Description => $description_start
11753                                                     . $current_age->name
11754                                                     . ' or earlier',
11755                                     );
11756             $previous_in = $current_in;
11757
11758             # Add clarifying material for the corresponding age file.  This is
11759             # in part because of the confusing and contradictory information
11760             # given in the Standard's documentation itself, as of 5.2.
11761             $current_age->add_description(
11762                             "Code point's usage was introduced in version "
11763                             . $current_age->name);
11764             $current_age->add_note("See also $in");
11765
11766         }
11767
11768         # And finally the code points whose usages have yet to be decided are
11769         # the same in both properties.  Note that permanently unassigned code
11770         # points actually have their usage assigned (as being permanently
11771         # unassigned), so that these tables are not the same as gc=cn.
11772         my $unassigned = $in->add_match_table($default_map);
11773         my $age_default = $age->table($default_map);
11774         $age_default->add_description(<<END
11775 Code point's usage has not been assigned in any Unicode release thus far.
11776 END
11777         );
11778         $unassigned->set_equivalent_to($age_default, Related => 1);
11779     }
11780
11781
11782     # Finished creating all the perl properties.  All non-internal non-string
11783     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
11784     # an underscore.)  These do not get a separate entry in the pod file
11785     foreach my $table ($perl->tables) {
11786         foreach my $alias ($table->aliases) {
11787             next if $alias->name =~ /^_/;
11788             $table->add_alias('Is_' . $alias->name,
11789                                Pod_Entry => 0,
11790                                Status => $alias->status,
11791                                Externally_Ok => 0);
11792         }
11793     }
11794
11795     # Here done with all the basic stuff.  Ready to populate the information
11796     # about each character if annotating them.
11797     if ($annotate) {
11798
11799         # See comments at its declaration
11800         $annotate_ranges = Range_Map->new;
11801
11802         # This separates out the non-characters from the other unassigneds, so
11803         # can give different annotations for each.
11804         $unassigned_sans_noncharacters = Range_List->new(
11805          Initialize => $gc->table('Unassigned')
11806                        & property_ref('Noncharacter_Code_Point')->table('N'));
11807
11808         for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
11809             $i = populate_char_info($i);    # Note sets $i so may cause skips
11810         }
11811     }
11812
11813     return;
11814 }
11815
11816 sub add_perl_synonyms() {
11817     # A number of Unicode tables have Perl synonyms that are expressed in
11818     # the single-form, \p{name}.  These are:
11819     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11820     #       \p{Is_Name} as synonyms
11821     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11822     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11823     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11824     #       conflict, \p{Value} and \p{Is_Value} as well
11825     #
11826     # This routine generates these synonyms, warning of any unexpected
11827     # conflicts.
11828
11829     # Construct the list of tables to get synonyms for.  Start with all the
11830     # binary and the General_Category ones.
11831     my @tables = grep { $_->type == $BINARY } property_ref('*');
11832     push @tables, $gc->tables;
11833
11834     # If the version of Unicode includes the Script property, add its tables
11835     if (defined property_ref('Script')) {
11836         push @tables, property_ref('Script')->tables;
11837     }
11838
11839     # The Block tables are kept separate because they are treated differently.
11840     # And the earliest versions of Unicode didn't include them, so add only if
11841     # there are some.
11842     my @blocks;
11843     push @blocks, $block->tables if defined $block;
11844
11845     # Here, have the lists of tables constructed.  Process blocks last so that
11846     # if there are name collisions with them, blocks have lowest priority.
11847     # Should there ever be other collisions, manual intervention would be
11848     # required.  See the comments at the beginning of the program for a
11849     # possible way to handle those semi-automatically.
11850     foreach my $table (@tables,  @blocks) {
11851
11852         # For non-binary properties, the synonym is just the name of the
11853         # table, like Greek, but for binary properties the synonym is the name
11854         # of the property, and means the code points in its 'Y' table.
11855         my $nominal = $table;
11856         my $nominal_property = $nominal->property;
11857         my $actual;
11858         if (! $nominal->isa('Property')) {
11859             $actual = $table;
11860         }
11861         else {
11862
11863             # Here is a binary property.  Use the 'Y' table.  Verify that is
11864             # there
11865             my $yes = $nominal->table('Y');
11866             unless (defined $yes) {  # Must be defined, but is permissible to
11867                                      # be empty.
11868                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
11869                 next;
11870             }
11871             $actual = $yes;
11872         }
11873
11874         foreach my $alias ($nominal->aliases) {
11875
11876             # Attempt to create a table in the perl directory for the
11877             # candidate table, using whatever aliases in it that don't
11878             # conflict.  Also add non-conflicting aliases for all these
11879             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11880             PREFIX:
11881             foreach my $prefix ("", 'Is_', 'In_') {
11882
11883                 # Only Block properties can have added 'In_' aliases.
11884                 next if $prefix eq 'In_' and $nominal_property != $block;
11885
11886                 my $proposed_name = $prefix . $alias->name;
11887
11888                 # No Is_Is, In_In, nor combinations thereof
11889                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11890                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11891
11892                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11893
11894                 # Get a reference to any existing table in the perl
11895                 # directory with the desired name.
11896                 my $pre_existing = $perl->table($proposed_name);
11897
11898                 if (! defined $pre_existing) {
11899
11900                     # No name collision, so ok to add the perl synonym.
11901
11902                     my $make_pod_entry;
11903                     my $externally_ok;
11904                     my $status = $actual->status;
11905                     if ($nominal_property == $block) {
11906
11907                         # For block properties, the 'In' form is preferred for
11908                         # external use; the pod file contains wild cards for
11909                         # this and the 'Is' form so no entries for those; and
11910                         # we don't want people using the name without the
11911                         # 'In', so discourage that.
11912                         if ($prefix eq "") {
11913                             $make_pod_entry = 1;
11914                             $status = $status || $DISCOURAGED;
11915                             $externally_ok = 0;
11916                         }
11917                         elsif ($prefix eq 'In_') {
11918                             $make_pod_entry = 0;
11919                             $status = $status || $NORMAL;
11920                             $externally_ok = 1;
11921                         }
11922                         else {
11923                             $make_pod_entry = 0;
11924                             $status = $status || $DISCOURAGED;
11925                             $externally_ok = 0;
11926                         }
11927                     }
11928                     elsif ($prefix ne "") {
11929
11930                         # The 'Is' prefix is handled in the pod by a wild
11931                         # card, and we won't use it for an external name
11932                         $make_pod_entry = 0;
11933                         $status = $status || $NORMAL;
11934                         $externally_ok = 0;
11935                     }
11936                     else {
11937
11938                         # Here, is an empty prefix, non block.  This gets its
11939                         # own pod entry and can be used for an external name.
11940                         $make_pod_entry = 1;
11941                         $status = $status || $NORMAL;
11942                         $externally_ok = 1;
11943                     }
11944
11945                     # Here, there isn't a perl pre-existing table with the
11946                     # name.  Look through the list of equivalents of this
11947                     # table to see if one is a perl table.
11948                     foreach my $equivalent ($actual->leader->equivalents) {
11949                         next if $equivalent->property != $perl;
11950
11951                         # Here, have found a table for $perl.  Add this alias
11952                         # to it, and are done with this prefix.
11953                         $equivalent->add_alias($proposed_name,
11954                                         Pod_Entry => $make_pod_entry,
11955                                         Status => $status,
11956                                         Externally_Ok => $externally_ok);
11957                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11958                         next PREFIX;
11959                     }
11960
11961                     # Here, $perl doesn't already have a table that is a
11962                     # synonym for this property, add one.
11963                     my $added_table = $perl->add_match_table($proposed_name,
11964                                             Pod_Entry => $make_pod_entry,
11965                                             Status => $status,
11966                                             Externally_Ok => $externally_ok);
11967                     # And it will be related to the actual table, since it is
11968                     # based on it.
11969                     $added_table->set_equivalent_to($actual, Related => 1);
11970                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11971                     next;
11972                 } # End of no pre-existing.
11973
11974                 # Here, there is a pre-existing table that has the proposed
11975                 # name.  We could be in trouble, but not if this is just a
11976                 # synonym for another table that we have already made a child
11977                 # of the pre-existing one.
11978                 if ($pre_existing->is_set_equivalent_to($actual)) {
11979                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11980                     $pre_existing->add_alias($proposed_name);
11981                     next;
11982                 }
11983
11984                 # Here, there is a name collision, but it still could be ok if
11985                 # the tables match the identical set of code points, in which
11986                 # case, we can combine the names.  Compare each table's code
11987                 # point list to see if they are identical.
11988                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11989                 if ($pre_existing->matches_identically_to($actual)) {
11990
11991                     # Here, they do match identically.  Not a real conflict.
11992                     # Make the perl version a child of the Unicode one, except
11993                     # in the non-obvious case of where the perl name is
11994                     # already a synonym of another Unicode property.  (This is
11995                     # excluded by the test for it being its own parent.)  The
11996                     # reason for this exclusion is that then the two Unicode
11997                     # properties become related; and we don't really know if
11998                     # they are or not.  We generate documentation based on
11999                     # relatedness, and this would be misleading.  Code
12000                     # later executed in the process will cause the tables to
12001                     # be represented by a single file anyway, without making
12002                     # it look in the pod like they are necessarily related.
12003                     if ($pre_existing->parent == $pre_existing
12004                         && ($pre_existing->property == $perl
12005                             || $actual->property == $perl))
12006                     {
12007                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12008                         $pre_existing->set_equivalent_to($actual, Related => 1);
12009                     }
12010                     elsif (main::DEBUG && $to_trace) {
12011                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12012                         trace $pre_existing->parent;
12013                     }
12014                     next PREFIX;
12015                 }
12016
12017                 # Here they didn't match identically, there is a real conflict
12018                 # between our new name and a pre-existing property.
12019                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12020                 $pre_existing->add_conflicting($nominal->full_name,
12021                                                'p',
12022                                                $actual);
12023
12024                 # Don't output a warning for aliases for the block
12025                 # properties (unless they start with 'In_') as it is
12026                 # expected that there will be conflicts and the block
12027                 # form loses.
12028                 if ($verbosity >= $NORMAL_VERBOSITY
12029                     && ($actual->property != $block || $prefix eq 'In_'))
12030                 {
12031                     print simple_fold(join_lines(<<END
12032 There is already an alias named $proposed_name (from " . $pre_existing . "),
12033 so not creating this alias for " . $actual
12034 END
12035                     ), "", 4);
12036                 }
12037
12038                 # Keep track for documentation purposes.
12039                 $has_In_conflicts++ if $prefix eq 'In_';
12040                 $has_Is_conflicts++ if $prefix eq 'Is_';
12041             }
12042         }
12043     }
12044
12045     # There are some properties which have No and Yes (and N and Y) as
12046     # property values, but aren't binary, and could possibly be confused with
12047     # binary ones.  So create caveats for them.  There are tables that are
12048     # named 'No', and tables that are named 'N', but confusion is not likely
12049     # unless they are the same table.  For example, N meaning Number or
12050     # Neutral is not likely to cause confusion, so don't add caveats to things
12051     # like them.
12052     foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
12053         my $yes = $property->table('Yes');
12054         if (defined $yes) {
12055             my $y = $property->table('Y');
12056             if (defined $y && $yes == $y) {
12057                 foreach my $alias ($property->aliases) {
12058                     $yes->add_conflicting($alias->name);
12059                 }
12060             }
12061         }
12062         my $no = $property->table('No');
12063         if (defined $no) {
12064             my $n = $property->table('N');
12065             if (defined $n && $no == $n) {
12066                 foreach my $alias ($property->aliases) {
12067                     $no->add_conflicting($alias->name, 'P');
12068                 }
12069             }
12070         }
12071     }
12072
12073     return;
12074 }
12075
12076 sub register_file_for_name($$$) {
12077     # Given info about a table and a datafile that it should be associated
12078     # with, register that association
12079
12080     my $table = shift;
12081     my $directory_ref = shift;   # Array of the directory path for the file
12082     my $file = shift;            # The file name in the final directory, [-1].
12083     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12084
12085     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12086
12087     if ($table->isa('Property')) {
12088         $table->set_file_path(@$directory_ref, $file);
12089         push @map_properties, $table
12090                                     if $directory_ref->[0] eq $map_directory;
12091         return;
12092     }
12093
12094     # Do all of the work for all equivalent tables when called with the leader
12095     # table, so skip if isn't the leader.
12096     return if $table->leader != $table;
12097
12098     # Join all the file path components together, using slashes.
12099     my $full_filename = join('/', @$directory_ref, $file);
12100
12101     # All go in the same subdirectory of unicore
12102     if ($directory_ref->[0] ne $matches_directory) {
12103         Carp::my_carp("Unexpected directory in "
12104                 .  join('/', @{$directory_ref}, $file));
12105     }
12106
12107     # For this table and all its equivalents ...
12108     foreach my $table ($table, $table->equivalents) {
12109
12110         # Associate it with its file internally.  Don't include the
12111         # $matches_directory first component
12112         $table->set_file_path(@$directory_ref, $file);
12113         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12114
12115         my $property = $table->property;
12116         $property = ($property == $perl)
12117                     ? ""                # 'perl' is never explicitly stated
12118                     : standardize($property->name) . '=';
12119
12120         my $deprecated = ($table->status eq $DEPRECATED)
12121                          ? $table->status_info
12122                          : "";
12123
12124         # And for each of the table's aliases...  This inner loop eventually
12125         # goes through all aliases in the UCD that we generate regex match
12126         # files for
12127         foreach my $alias ($table->aliases) {
12128             my $name = $alias->name;
12129
12130             # Generate an entry in either the loose or strict hashes, which
12131             # will translate the property and alias names combination into the
12132             # file where the table for them is stored.
12133             my $standard;
12134             if ($alias->loose_match) {
12135                 $standard = $property . standardize($alias->name);
12136                 if (exists $loose_to_file_of{$standard}) {
12137                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12138                 }
12139                 else {
12140                     $loose_to_file_of{$standard} = $sub_filename;
12141                 }
12142             }
12143             else {
12144                 $standard = lc ($property . $name);
12145                 if (exists $stricter_to_file_of{$standard}) {
12146                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12147                 }
12148                 else {
12149                     $stricter_to_file_of{$standard} = $sub_filename;
12150
12151                     # Tightly coupled with how utf8_heavy.pl works, for a
12152                     # floating point number that is a whole number, get rid of
12153                     # the trailing decimal point and 0's, so that utf8_heavy
12154                     # will work.  Also note that this assumes that such a
12155                     # number is matched strictly; so if that were to change,
12156                     # this would be wrong.
12157                     if ((my $integer_name = $name)
12158                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12159                     {
12160                         $stricter_to_file_of{$property . $integer_name}
12161                             = $sub_filename;
12162                     }
12163                 }
12164             }
12165
12166             # Keep a list of the deprecated properties and their filenames
12167             if ($deprecated) {
12168                 $utf8::why_deprecated{$sub_filename} = $deprecated;
12169             }
12170         }
12171     }
12172
12173     return;
12174 }
12175
12176 {   # Closure
12177     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
12178                      # conflicts
12179     my %full_dir_name_of;   # Full length names of directories used.
12180
12181     sub construct_filename($$$) {
12182         # Return a file name for a table, based on the table name, but perhaps
12183         # changed to get rid of non-portable characters in it, and to make
12184         # sure that it is unique on a file system that allows the names before
12185         # any period to be at most 8 characters (DOS).  While we're at it
12186         # check and complain if there are any directory conflicts.
12187
12188         my $name = shift;       # The name to start with
12189         my $mutable = shift;    # Boolean: can it be changed?  If no, but
12190                                 # yet it must be to work properly, a warning
12191                                 # is given
12192         my $directories_ref = shift;  # A reference to an array containing the
12193                                 # path to the file, with each element one path
12194                                 # component.  This is used because the same
12195                                 # name can be used in different directories.
12196         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12197
12198         my $warn = ! defined wantarray;  # If true, then if the name is
12199                                 # changed, a warning is issued as well.
12200
12201         if (! defined $name) {
12202             Carp::my_carp("Undefined name in directory "
12203                           . File::Spec->join(@$directories_ref)
12204                           . ". '_' used");
12205             return '_';
12206         }
12207
12208         # Make sure that no directory names conflict with each other.  Look at
12209         # each directory in the input file's path.  If it is already in use,
12210         # assume it is correct, and is merely being re-used, but if we
12211         # truncate it to 8 characters, and find that there are two directories
12212         # that are the same for the first 8 characters, but differ after that,
12213         # then that is a problem.
12214         foreach my $directory (@$directories_ref) {
12215             my $short_dir = substr($directory, 0, 8);
12216             if (defined $full_dir_name_of{$short_dir}) {
12217                 next if $full_dir_name_of{$short_dir} eq $directory;
12218                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
12219             }
12220             else {
12221                 $full_dir_name_of{$short_dir} = $directory;
12222             }
12223         }
12224
12225         my $path = join '/', @$directories_ref;
12226         $path .= '/' if $path;
12227
12228         # Remove interior underscores.
12229         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12230
12231         # Change any non-word character into an underscore, and truncate to 8.
12232         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
12233         substr($filename, 8) = "" if length($filename) > 8;
12234
12235         # Make sure the basename doesn't conflict with something we
12236         # might have already written. If we have, say,
12237         #     InGreekExtended1
12238         #     InGreekExtended2
12239         # they become
12240         #     InGreekE
12241         #     InGreek2
12242         my $warned = 0;
12243         while (my $num = $base_names{$path}{lc $filename}++) {
12244             $num++; # so basenames with numbers start with '2', which
12245                     # just looks more natural.
12246
12247             # Want to append $num, but if it'll make the basename longer
12248             # than 8 characters, pre-truncate $filename so that the result
12249             # is acceptable.
12250             my $delta = length($filename) + length($num) - 8;
12251             if ($delta > 0) {
12252                 substr($filename, -$delta) = $num;
12253             }
12254             else {
12255                 $filename .= $num;
12256             }
12257             if ($warn && ! $warned) {
12258                 $warned = 1;
12259                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
12260             }
12261         }
12262
12263         return $filename if $mutable;
12264
12265         # If not changeable, must return the input name, but warn if needed to
12266         # change it beyond shortening it.
12267         if ($name ne $filename
12268             && substr($name, 0, length($filename)) ne $filename) {
12269             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
12270         }
12271         return $name;
12272     }
12273 }
12274
12275 # The pod file contains a very large table.  Many of the lines in that table
12276 # would exceed a typical output window's size, and so need to be wrapped with
12277 # a hanging indent to make them look good.  The pod language is really
12278 # insufficient here.  There is no general construct to do that in pod, so it
12279 # is done here by beginning each such line with a space to cause the result to
12280 # be output without formatting, and doing all the formatting here.  This leads
12281 # to the result that if the eventual display window is too narrow it won't
12282 # look good, and if the window is too wide, no advantage is taken of that
12283 # extra width.  A further complication is that the output may be indented by
12284 # the formatter so that there is less space than expected.  What I (khw) have
12285 # done is to assume that that indent is a particular number of spaces based on
12286 # what it is in my Linux system;  people can always resize their windows if
12287 # necessary, but this is obviously less than desirable, but the best that can
12288 # be expected.
12289 my $automatic_pod_indent = 8;
12290
12291 # Try to format so that uses fewest lines, but few long left column entries
12292 # slide into the right column.  An experiment on 5.1 data yielded the
12293 # following percentages that didn't cut into the other side along with the
12294 # associated first-column widths
12295 # 69% = 24
12296 # 80% not too bad except for a few blocks
12297 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12298 # 95% = 37;
12299 my $indent_info_column = 27;    # 75% of lines didn't have overlap
12300
12301 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
12302                     # The 3 is because of:
12303                     #   1   for the leading space to tell the pod formatter to
12304                     #       output as-is
12305                     #   1   for the flag
12306                     #   1   for the space between the flag and the main data
12307
12308 sub format_pod_line ($$$;$$) {
12309     # Take a pod line and return it, formatted properly
12310
12311     my $first_column_width = shift;
12312     my $entry = shift;  # Contents of left column
12313     my $info = shift;   # Contents of right column
12314
12315     my $status = shift || "";   # Any flag
12316
12317     my $loose_match = shift;    # Boolean.
12318     $loose_match = 1 unless defined $loose_match;
12319
12320     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12321
12322     my $flags = "";
12323     $flags .= $STRICTER if ! $loose_match;
12324
12325     $flags .= $status if $status;
12326
12327     # There is a blank in the left column to cause the pod formatter to
12328     # output the line as-is.
12329     return sprintf " %-*s%-*s %s\n",
12330                     # The first * in the format is replaced by this, the -1 is
12331                     # to account for the leading blank.  There isn't a
12332                     # hard-coded blank after this to separate the flags from
12333                     # the rest of the line, so that in the unlikely event that
12334                     # multiple flags are shown on the same line, they both
12335                     # will get displayed at the expense of that separation,
12336                     # but since they are left justified, a blank will be
12337                     # inserted in the normal case.
12338                     $FILLER - 1,
12339                     $flags,
12340
12341                     # The other * in the format is replaced by this number to
12342                     # cause the first main column to right fill with blanks.
12343                     # The -1 is for the guaranteed blank following it.
12344                     $first_column_width - $FILLER - 1,
12345                     $entry,
12346                     $info;
12347 }
12348
12349 my @zero_match_tables;  # List of tables that have no matches in this release
12350
12351 sub make_table_pod_entries($) {
12352     # This generates the entries for the pod file for a given table.
12353     # Also done at this time are any children tables.  The output looks like:
12354     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
12355
12356     my $input_table = shift;        # Table the entry is for
12357     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12358
12359     # Generate parent and all its children at the same time.
12360     return if $input_table->parent != $input_table;
12361
12362     my $property = $input_table->property;
12363     my $type = $property->type;
12364     my $full_name = $property->full_name;
12365
12366     my $count = $input_table->count;
12367     my $string_count = clarify_number($count);
12368     my $status = $input_table->status;
12369     my $status_info = $input_table->status_info;
12370
12371     my $entry_for_first_table; # The entry for the first table output.
12372                            # Almost certainly, it is the parent.
12373
12374     # For each related table (including itself), we will generate a pod entry
12375     # for each name each table goes by
12376     foreach my $table ($input_table, $input_table->children) {
12377
12378         # utf8_heavy.pl cannot deal with null string property values, so don't
12379         # output any.
12380         next if $table->name eq "";
12381
12382         # First, gather all the info that applies to this table as a whole.
12383
12384         push @zero_match_tables, $table if $count == 0;
12385
12386         my $table_property = $table->property;
12387
12388         # The short name has all the underscores removed, while the full name
12389         # retains them.  Later, we decide whether to output a short synonym
12390         # for the full one, we need to compare apples to apples, so we use the
12391         # short name's length including underscores.
12392         my $table_property_short_name_length;
12393         my $table_property_short_name
12394             = $table_property->short_name(\$table_property_short_name_length);
12395         my $table_property_full_name = $table_property->full_name;
12396
12397         # Get how much savings there is in the short name over the full one
12398         # (delta will always be <= 0)
12399         my $table_property_short_delta = $table_property_short_name_length
12400                                          - length($table_property_full_name);
12401         my @table_description = $table->description;
12402         my @table_note = $table->note;
12403
12404         # Generate an entry for each alias in this table.
12405         my $entry_for_first_alias;  # saves the first one encountered.
12406         foreach my $alias ($table->aliases) {
12407
12408             # Skip if not to go in pod.
12409             next unless $alias->make_pod_entry;
12410
12411             # Start gathering all the components for the entry
12412             my $name = $alias->name;
12413
12414             my $entry;      # Holds the left column, may include extras
12415             my $entry_ref;  # To refer to the left column's contents from
12416                             # another entry; has no extras
12417
12418             # First the left column of the pod entry.  Tables for the $perl
12419             # property always use the single form.
12420             if ($table_property == $perl) {
12421                 $entry = "\\p{$name}";
12422                 $entry_ref = "\\p{$name}";
12423             }
12424             else {    # Compound form.
12425
12426                 # Only generate one entry for all the aliases that mean true
12427                 # or false in binary properties.  Append a '*' to indicate
12428                 # some are missing.  (The heading comment notes this.)
12429                 my $wild_card_mark;
12430                 if ($type == $BINARY) {
12431                     next if $name ne 'N' && $name ne 'Y';
12432                     $wild_card_mark = '*';
12433                 }
12434                 else {
12435                     $wild_card_mark = "";
12436                 }
12437
12438                 # Colon-space is used to give a little more space to be easier
12439                 # to read;
12440                 $entry = "\\p{"
12441                         . $table_property_full_name
12442                         . ": $name$wild_card_mark}";
12443
12444                 # But for the reference to this entry, which will go in the
12445                 # right column, where space is at a premium, use equals
12446                 # without a space
12447                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12448             }
12449
12450             # Then the right (info) column.  This is stored as components of
12451             # an array for the moment, then joined into a string later.  For
12452             # non-internal only properties, begin the info with the entry for
12453             # the first table we encountered (if any), as things are ordered
12454             # so that that one is the most descriptive.  This leads to the
12455             # info column of an entry being a more descriptive version of the
12456             # name column
12457             my @info;
12458             if ($name =~ /^_/) {
12459                 push @info,
12460                         '(For internal use by Perl, not necessarily stable)';
12461             }
12462             elsif ($entry_for_first_alias) {
12463                 push @info, $entry_for_first_alias;
12464             }
12465
12466             # If this entry is equivalent to another, add that to the info,
12467             # using the first such table we encountered
12468             if ($entry_for_first_table) {
12469                 if (@info) {
12470                     push @info, "(= $entry_for_first_table)";
12471                 }
12472                 else {
12473                     push @info, $entry_for_first_table;
12474                 }
12475             }
12476
12477             # If the name is a large integer, add an equivalent with an
12478             # exponent for better readability
12479             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12480                 push @info, sprintf "(= %.1e)", $name
12481             }
12482
12483             my $parenthesized = "";
12484             if (! $entry_for_first_alias) {
12485
12486                 # This is the first alias for the current table.  The alias
12487                 # array is ordered so that this is the fullest, most
12488                 # descriptive alias, so it gets the fullest info.  The other
12489                 # aliases are mostly merely pointers to this one, using the
12490                 # information already added above.
12491
12492                 # Display any status message, but only on the parent table
12493                 if ($status && ! $entry_for_first_table) {
12494                     push @info, $status_info;
12495                 }
12496
12497                 # Put out any descriptive info
12498                 if (@table_description || @table_note) {
12499                     push @info, join "; ", @table_description, @table_note;
12500                 }
12501
12502                 # Look to see if there is a shorter name we can point people
12503                 # at
12504                 my $standard_name = standardize($name);
12505                 my $short_name;
12506                 my $proposed_short = $table->short_name;
12507                 if (defined $proposed_short) {
12508                     my $standard_short = standardize($proposed_short);
12509
12510                     # If the short name is shorter than the standard one, or
12511                     # even it it's not, but the combination of it and its
12512                     # short property name (as in \p{prop=short} ($perl doesn't
12513                     # have this form)) saves at least two characters, then,
12514                     # cause it to be listed as a shorter synonym.
12515                     if (length $standard_short < length $standard_name
12516                         || ($table_property != $perl
12517                             && (length($standard_short)
12518                                 - length($standard_name)
12519                                 + $table_property_short_delta)  # (<= 0)
12520                                 < -2))
12521                     {
12522                         $short_name = $proposed_short;
12523                         if ($table_property != $perl) {
12524                             $short_name = $table_property_short_name
12525                                           . "=$short_name";
12526                         }
12527                         $short_name = "\\p{$short_name}";
12528                     }
12529                 }
12530
12531                 # And if this is a compound form name, see if there is a
12532                 # single form equivalent
12533                 my $single_form;
12534                 if ($table_property != $perl) {
12535
12536                     # Special case the binary N tables, so that will print
12537                     # \P{single}, but use the Y table values to populate
12538                     # 'single', as we haven't populated the N table.
12539                     my $test_table;
12540                     my $p;
12541                     if ($type == $BINARY
12542                         && $input_table == $property->table('No'))
12543                     {
12544                         $test_table = $property->table('Yes');
12545                         $p = 'P';
12546                     }
12547                     else {
12548                         $test_table = $input_table;
12549                         $p = 'p';
12550                     }
12551
12552                     # Look for a single form amongst all the children.
12553                     foreach my $table ($test_table->children) {
12554                         next if $table->property != $perl;
12555                         my $proposed_name = $table->short_name;
12556                         next if ! defined $proposed_name;
12557
12558                         # Don't mention internal-only properties as a possible
12559                         # single form synonym
12560                         next if substr($proposed_name, 0, 1) eq '_';
12561
12562                         $proposed_name = "\\$p\{$proposed_name}";
12563                         if (! defined $single_form
12564                             || length($proposed_name) < length $single_form)
12565                         {
12566                             $single_form = $proposed_name;
12567
12568                             # The goal here is to find a single form; not the
12569                             # shortest possible one.  We've already found a
12570                             # short name.  So, stop at the first single form
12571                             # found, which is likely to be closer to the
12572                             # original.
12573                             last;
12574                         }
12575                     }
12576                 }
12577
12578                 # Ouput both short and single in the same parenthesized
12579                 # expression, but with only one of 'Single', 'Short' if there
12580                 # are both items.
12581                 if ($short_name || $single_form || $table->conflicting) {
12582                     $parenthesized .= '(';
12583                     $parenthesized .= "Short: $short_name" if $short_name;
12584                     if ($short_name && $single_form) {
12585                         $parenthesized .= ', ';
12586                     }
12587                     elsif ($single_form) {
12588                         $parenthesized .= 'Single: ';
12589                     }
12590                     $parenthesized .= $single_form if $single_form;
12591                 }
12592             }
12593
12594
12595             # Warn if this property isn't the same as one that a
12596             # semi-casual user might expect.  The other components of this
12597             # parenthesized structure are calculated only for the first entry
12598             # for this table, but the conflicting is deemed important enough
12599             # to go on every entry.
12600             my $conflicting = join " NOR ", $table->conflicting;
12601             if ($conflicting) {
12602                 $parenthesized .= '(' if ! $parenthesized;
12603                 $parenthesized .=  '; ' if $parenthesized ne '(';
12604                 $parenthesized .= "NOT $conflicting";
12605             }
12606             $parenthesized .= ')' if $parenthesized;
12607
12608             push @info, $parenthesized if $parenthesized;
12609
12610             if ($table_property != $perl && $table->perl_extension) {
12611                 push @info, '(Perl extension)';
12612             }
12613             push @info, "($string_count)";
12614
12615             # Now, we have both the entry and info so add them to the
12616             # list of all the properties.
12617             push @match_properties,
12618                 format_pod_line($indent_info_column,
12619                                 $entry,
12620                                 join( " ", @info),
12621                                 $alias->status,
12622                                 $alias->loose_match);
12623
12624             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12625         } # End of looping through the aliases for this table.
12626
12627         if (! $entry_for_first_table) {
12628             $entry_for_first_table = $entry_for_first_alias;
12629         }
12630     } # End of looping through all the related tables
12631     return;
12632 }
12633
12634 sub pod_alphanumeric_sort {
12635     # Sort pod entries alphanumerically.
12636
12637     # The first few character columns are filler, plus the '\p{'; and get rid
12638     # of all the trailing stuff, starting with the trailing '}', so as to sort
12639     # on just 'Name=Value'
12640     (my $a = lc $a) =~ s/^ .*? { //x;
12641     $a =~ s/}.*//;
12642     (my $b = lc $b) =~ s/^ .*? { //x;
12643     $b =~ s/}.*//;
12644
12645     # Determine if the two operands are both internal only or both not.
12646     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12647     # should be the underscore that begins internal only
12648     my $a_is_internal = (substr($a, 0, 1) eq '_');
12649     my $b_is_internal = (substr($b, 0, 1) eq '_');
12650
12651     # Sort so the internals come last in the table instead of first (which the
12652     # leading underscore would otherwise indicate).
12653     if ($a_is_internal != $b_is_internal) {
12654         return 1 if $a_is_internal;
12655         return -1
12656     }
12657
12658     # Determine if the two operands are numeric property values or not.
12659     # A numeric property will look like xyz: 3.  But the number
12660     # can begin with an optional minus sign, and may have a
12661     # fraction or rational component, like xyz: 3/2.  If either
12662     # isn't numeric, use alphabetic sort.
12663     my ($a_initial, $a_number) =
12664         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12665     return $a cmp $b unless defined $a_number;
12666     my ($b_initial, $b_number) =
12667         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12668     return $a cmp $b unless defined $b_number;
12669
12670     # Here they are both numeric, but use alphabetic sort if the
12671     # initial parts don't match
12672     return $a cmp $b if $a_initial ne $b_initial;
12673
12674     # Convert rationals to floating for the comparison.
12675     $a_number = eval $a_number if $a_number =~ qr{/};
12676     $b_number = eval $b_number if $b_number =~ qr{/};
12677
12678     return $a_number <=> $b_number;
12679 }
12680
12681 sub make_pod () {
12682     # Create the .pod file.  This generates the various subsections and then
12683     # combines them in one big HERE document.
12684
12685     return unless defined $pod_directory;
12686     print "Making pod file\n" if $verbosity >= $PROGRESS;
12687
12688     my $exception_message =
12689     '(Any exceptions are individually noted beginning with the word NOT.)';
12690     my @block_warning;
12691     if (-e 'Blocks.txt') {
12692
12693         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
12694         # if the global $has_In_conflicts indicates we have them.
12695         push @match_properties, format_pod_line($indent_info_column,
12696                                                 '\p{In_*}',
12697                                                 '\p{Block: *}'
12698                                                     . (($has_In_conflicts)
12699                                                       ? " $exception_message"
12700                                                       : ""));
12701         @block_warning = << "END";
12702
12703 Matches in the Block property have shortcuts that begin with 'In_'.  For
12704 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}.  For backward
12705 compatibility, if there is no conflict with another shortcut, these may also
12706 be written as \\p{Latin1} or \\p{Is_Latin1}.  But, N.B., there are numerous
12707 such conflicting shortcuts.  Use of these forms for Block is discouraged, and
12708 are flagged as such, not only because of the potential confusion as to what is
12709 meant, but also because a later release of Unicode may preempt the shortcut,
12710 and your program would no longer be correct.  Use the 'In_' form instead to
12711 avoid this, or even more clearly, use the compound form, e.g.,
12712 \\p{blk:latin1}.  See L<perlunicode/"Blocks"> for more information about this.
12713 END
12714     }
12715     my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12716     $text = "$exception_message $text" if $has_Is_conflicts;
12717
12718     # And the 'Is_ line';
12719     push @match_properties, format_pod_line($indent_info_column,
12720                                             '\p{Is_*}',
12721                                             "\\p{*} $text");
12722
12723     # Sort the properties array for output.  It is sorted alphabetically
12724     # except numerically for numeric properties, and only output unique lines.
12725     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12726
12727     my $formatted_properties = simple_fold(\@match_properties,
12728                                         "",
12729                                         # indent succeeding lines by two extra
12730                                         # which looks better
12731                                         $indent_info_column + 2,
12732
12733                                         # shorten the line length by how much
12734                                         # the formatter indents, so the folded
12735                                         # line will fit in the space
12736                                         # presumably available
12737                                         $automatic_pod_indent);
12738     # Add column headings, indented to be a little more centered, but not
12739     # exactly
12740     $formatted_properties =  format_pod_line($indent_info_column,
12741                                                     '    NAME',
12742                                                     '           INFO')
12743                                     . "\n"
12744                                     . $formatted_properties;
12745
12746     # Generate pod documentation lines for the tables that match nothing
12747     my $zero_matches;
12748     if (@zero_match_tables) {
12749         @zero_match_tables = uniques(@zero_match_tables);
12750         $zero_matches = join "\n\n",
12751                         map { $_ = '=item \p{' . $_->complete_name . "}" }
12752                             sort { $a->complete_name cmp $b->complete_name }
12753                             uniques(@zero_match_tables);
12754
12755         $zero_matches = <<END;
12756
12757 =head2 Legal \\p{} and \\P{} constructs that match no characters
12758
12759 Unicode has some property-value pairs that currently don't match anything.
12760 This happens generally either because they are obsolete, or for symmetry with
12761 other forms, but no language has yet been encoded that uses them.  In this
12762 version of Unicode, the following match zero code points:
12763
12764 =over 4
12765
12766 $zero_matches
12767
12768 =back
12769
12770 END
12771     }
12772
12773     # Generate list of properties that we don't accept, grouped by the reasons
12774     # why.  This is so only put out the 'why' once, and then list all the
12775     # properties that have that reason under it.
12776
12777     my %why_list;   # The keys are the reasons; the values are lists of
12778                     # properties that have the key as their reason
12779
12780     # For each property, add it to the list that are suppressed for its reason
12781     # The sort will cause the alphabetically first properties to be added to
12782     # each list first, so each list will be sorted.
12783     foreach my $property (sort keys %why_suppressed) {
12784         push @{$why_list{$why_suppressed{$property}}}, $property;
12785     }
12786
12787     # For each reason (sorted by the first property that has that reason)...
12788     my @bad_re_properties;
12789     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12790                      keys %why_list)
12791     {
12792         # Add to the output, all the properties that have that reason.  Start
12793         # with an empty line.
12794         push @bad_re_properties, "\n\n";
12795
12796         my $has_item = 0;   # Flag if actually output anything.
12797         foreach my $name (@{$why_list{$why}}) {
12798
12799             # Split compound names into $property and $table components
12800             my $property = $name;
12801             my $table;
12802             if ($property =~ / (.*) = (.*) /x) {
12803                 $property = $1;
12804                 $table = $2;
12805             }
12806
12807             # This release of Unicode may not have a property that is
12808             # suppressed, so don't reference a non-existent one.
12809             $property = property_ref($property);
12810             next if ! defined $property;
12811
12812             # And since this list is only for match tables, don't list the
12813             # ones that don't have match tables.
12814             next if ! $property->to_create_match_tables;
12815
12816             # Find any abbreviation, and turn it into a compound name if this
12817             # is a property=value pair.
12818             my $short_name = $property->name;
12819             $short_name .= '=' . $property->table($table)->name if $table;
12820
12821             # And add the property as an item for the reason.
12822             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12823             $has_item = 1;
12824         }
12825
12826         # And add the reason under the list of properties, if such a list
12827         # actually got generated.  Note that the header got added
12828         # unconditionally before.  But pod ignores extra blank lines, so no
12829         # harm.
12830         push @bad_re_properties, "\n$why\n" if $has_item;
12831
12832     } # End of looping through each reason.
12833
12834     # Generate a list of the properties whose map table we output, from the
12835     # global @map_properties.
12836     my @map_tables_actually_output;
12837     my $info_indent = 20;       # Left column is narrower than \p{} table.
12838     foreach my $property (@map_properties) {
12839
12840         # Get the path to the file; don't output any not in the standard
12841         # directory.
12842         my @path = $property->file_path;
12843         next if $path[0] ne $map_directory;
12844         shift @path;    # Remove the standard name
12845
12846         my $file = join '/', @path; # In case is in sub directory
12847         my $info = $property->full_name;
12848         my $short_name = $property->name;
12849         if ($info ne $short_name) {
12850             $info .= " ($short_name)";
12851         }
12852         foreach my $more_info ($property->description,
12853                                $property->note,
12854                                $property->status_info)
12855         {
12856             next unless $more_info;
12857             $info =~ s/\.\Z//;
12858             $info .= ".  $more_info";
12859         }
12860         push @map_tables_actually_output, format_pod_line($info_indent,
12861                                                           $file,
12862                                                           $info,
12863                                                           $property->status);
12864     }
12865
12866     # Sort alphabetically, and fold for output
12867     @map_tables_actually_output = sort
12868                             pod_alphanumeric_sort @map_tables_actually_output;
12869     @map_tables_actually_output
12870                         = simple_fold(\@map_tables_actually_output,
12871                                         ' ',
12872                                         $info_indent,
12873                                         $automatic_pod_indent);
12874
12875     # Generate a list of the formats that can appear in the map tables.
12876     my @map_table_formats;
12877     foreach my $format (sort keys %map_table_formats) {
12878         push @map_table_formats, " $format    $map_table_formats{$format}\n";
12879     }
12880
12881     # Everything is ready to assemble.
12882     my @OUT = << "END";
12883 =begin comment
12884
12885 $HEADER
12886
12887 To change this file, edit $0 instead.
12888
12889 =end comment
12890
12891 =head1 NAME
12892
12893 $pod_file - Index of Unicode Version $string_version properties in Perl
12894
12895 =head1 DESCRIPTION
12896
12897 There are many properties in Unicode, and Perl provides access to almost all of
12898 them, as well as some additional extensions and short-cut synonyms.
12899
12900 And just about all of the few that aren't accessible through the Perl
12901 core are accessible through the modules: Unicode::Normalize and
12902 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12903
12904 This document merely lists all available properties and does not attempt to
12905 explain what each property really means.  There is a brief description of each
12906 Perl extension.  There is some detail about Blocks, Scripts, General_Category,
12907 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12908 Unicode properties, refer to the Unicode standard.  A good starting place is
12909 L<$unicode_reference_url>.  More information on the Perl extensions is in
12910 L<perlrecharclass>.
12911
12912 Note that you can define your own properties; see
12913 L<perlunicode/"User-Defined Character Properties">.
12914
12915 =head1 Properties accessible through \\p{} and \\P{}
12916
12917 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12918 the Unicode character properties.  The table below shows all these constructs,
12919 both single and compound forms.
12920
12921 B<Compound forms> consist of two components, separated by an equals sign or a
12922 colon.  The first component is the property name, and the second component is
12923 the particular value of the property to match against, for example,
12924 '\\p{Script: Greek}' and '\\p{Script=Greek}' both mean to match characters
12925 whose Script property is Greek.
12926
12927 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12928 their equivalent compound forms.  The table shows these equivalences.  (In our
12929 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12930 There are also a few Perl-defined single forms that are not shortcuts for a
12931 compound form.  One such is \\p{Word}.  These are also listed in the table.
12932
12933 In parsing these constructs, Perl always ignores Upper/lower case differences
12934 everywhere within the {braces}.  Thus '\\p{Greek}' means the same thing as
12935 '\\p{greek}'.  But note that changing the case of the 'p' or 'P' before the
12936 left brace completely changes the meaning of the construct, from "match" (for
12937 '\\p{}') to "doesn't match" (for '\\P{}').  Casing in this document is for
12938 improved legibility.
12939
12940 Also, white space, hyphens, and underscores are also normally ignored
12941 everywhere between the {braces}, and hence can be freely added or removed
12942 even if the C</x> modifier hasn't been specified on the regular expression.
12943 But $a_bold_stricter at the beginning of an entry in the table below
12944 means that tighter (stricter) rules are used for that entry:
12945
12946 =over 4
12947
12948 =item Single form (\\p{name}) tighter rules:
12949
12950 White space, hyphens, and underscores ARE significant
12951 except for:
12952
12953 =over 4
12954
12955 =item * white space adjacent to a non-word character
12956
12957 =item * underscores separating digits in numbers
12958
12959 =back
12960
12961 That means, for example, that you can freely add or remove white space
12962 adjacent to (but within) the braces without affecting the meaning.
12963
12964 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12965
12966 The tighter rules given above for the single form apply to everything to the
12967 right of the colon or equals; the looser rules still apply to everything to
12968 the left.
12969
12970 That means, for example, that you can freely add or remove white space
12971 adjacent to (but within) the braces and the colon or equal sign.
12972
12973 =back
12974
12975 Some properties are considered obsolete, but still available.  There are
12976 several varieties of obsolescence:
12977
12978 =over 4
12979
12980 =item Obsolete
12981
12982 Properties marked with $a_bold_obsolete in the table are considered
12983 obsolete.
12984
12985 =item Stabilized
12986
12987 Obsolete properties may be stabilized.  Such a determination does not indicate
12988 that the property should or should not be used; instead it is a declaration
12989 that the property will not be maintained nor extended for newly encoded
12990 characters.  Such properties are marked with $a_bold_stabilized in the
12991 table.
12992
12993 =item Deprecated
12994
12995 An obsolete property may be deprecated, perhaps because its original intent
12996 has been replaced by another property or because its specification was somehow
12997 defective.  This means that its use is strongly
12998 discouraged, so much so that a warning will be issued if used, unless the
12999 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13000 statement.  $A_bold_deprecated flags each such entry in the table, and
13001 the entry there for the longest, most descriptive version of the property will
13002 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
13003 warning, even for properties that aren't officially deprecated by Unicode,
13004 when there used to be characters or code points that were matched by them, but
13005 no longer.  This is to warn you that your program may not work like it did on
13006 earlier Unicode releases.
13007
13008 A deprecated property may be made unavailable in a future Perl version, so it
13009 is best to move away from them.
13010
13011 =back
13012
13013 Some Perl extensions are present for backwards compatibility and are
13014 discouraged from being used, but not obsolete.  $A_bold_discouraged
13015 flags each such entry in the table.
13016
13017 @block_warning
13018
13019 The table below has two columns.  The left column contains the \\p{}
13020 constructs to look up, possibly preceded by the flags mentioned above; and
13021 the right column contains information about them, like a description, or
13022 synonyms.  It shows both the single and compound forms for each property that
13023 has them.  If the left column is a short name for a property, the right column
13024 will give its longer, more descriptive name; and if the left column is the
13025 longest name, the right column will show any equivalent shortest name, in both
13026 single and compound forms if applicable.
13027
13028 The right column will also caution you if a property means something different
13029 than what might normally be expected.
13030
13031 All single forms are Perl extensions; a few compound forms are as well, and
13032 are noted as such.
13033
13034 Numbers in (parentheses) indicate the total number of code points matched by
13035 the property.  For emphasis, those properties that match no code points at all
13036 are listed as well in a separate section following the table.
13037
13038 There is no description given for most non-Perl defined properties (See
13039 $unicode_reference_url for that).
13040
13041 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13042 combinations.  For example, entries like:
13043
13044  \\p{Gc: *}                                  \\p{General_Category: *}
13045
13046 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13047 for the latter is also valid for the former.  Similarly,
13048
13049  \\p{Is_*}                                   \\p{*}
13050
13051 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
13052 \\p{IsFoo} are also valid and all mean the same thing.  And similarly,
13053 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}.  '*' here
13054 is restricted to something not beginning with an underscore.
13055
13056 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13057 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
13058 'N*' to indicate this, and doesn't have separate entries for the other
13059 possibilities.  Note that not all properties which have values 'Yes' and 'No'
13060 are binary, and they have all their values spelled out without using this wild
13061 card, and a C<NOT> clause in their description that highlights their not being
13062 binary.  These also require the compound form to match them, whereas true
13063 binary properties have both single and compound forms available.
13064
13065 Note that all non-essential underscores are removed in the display of the
13066 short names below.
13067
13068 B<Summary legend:>
13069
13070 =over 4
13071
13072 =item B<*> is a wild-card
13073
13074 =item B<(\\d+)> in the info column gives the number of code points matched by
13075 this property.
13076
13077 =item B<$DEPRECATED> means this is deprecated.
13078
13079 =item B<$OBSOLETE> means this is obsolete.
13080
13081 =item B<$STABILIZED> means this is stabilized.
13082
13083 =item B<$STRICTER> means tighter (stricter) name matching applies.
13084
13085 =item B<$DISCOURAGED> means use of this form is discouraged.
13086
13087 =back
13088
13089 $formatted_properties
13090
13091 $zero_matches
13092
13093 =head1 Properties not accessible through \\p{} and \\P{}
13094
13095 A few properties are accessible in Perl via various function calls only.
13096 These are:
13097  Lowercase_Mapping          lc() and lcfirst()
13098  Titlecase_Mapping          ucfirst()
13099  Uppercase_Mapping          uc()
13100
13101 Case_Folding is accessible through the /i modifier in regular expressions.
13102
13103 The Name property is accessible through the \\N{} interpolation in
13104 double-quoted strings and regular expressions, but both usages require a C<use
13105 charnames;> to be specified, which also contains related functions viacode(),
13106 vianame(), and string_vianame().
13107
13108 =head1 Unicode regular expression properties that are NOT accepted by Perl
13109
13110 Perl will generate an error for a few character properties in Unicode when
13111 used in a regular expression.  The non-Unihan ones are listed below, with the
13112 reasons they are not accepted, perhaps with work-arounds.  The short names for
13113 the properties are listed enclosed in (parentheses).
13114
13115 =over 4
13116
13117 @bad_re_properties
13118
13119 =back
13120
13121 An installation can choose to allow any of these to be matched by changing the
13122 controlling lists contained in the program
13123 C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
13124 (C<\%Config> is available from the Config module).
13125
13126 =head1 Files in the I<To> directory (for serious hackers only)
13127
13128 All Unicode properties are really mappings (in the mathematical sense) from
13129 code points to their respective values.  As part of its build process,
13130 Perl constructs tables containing these mappings for all properties that it
13131 deals with.  But only a few of these are written out into files.
13132 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
13133 (%Config is available from the Config module).
13134
13135 Those ones written are ones needed by Perl internally during execution, or for
13136 which there is some demand, and those for which there is no access through the
13137 Perl core.  Generally, properties that can be used in regular expression
13138 matching do not have their map tables written, like Script.  Nor are the
13139 simplistic properties that have a better, more complete version, such as
13140 Simple_Uppercase_Mapping  (Uppercase_Mapping is written instead).
13141
13142 None of the properties in the I<To> directory are currently directly
13143 accessible through the Perl core, although some may be accessed indirectly.
13144 For example, the uc() function implements the Uppercase_Mapping property and
13145 uses the F<Upper.pl> file found in this directory.
13146
13147 The available files in the current installation, with their properties (short
13148 names in parentheses), and any flags or comments about them, are:
13149
13150 @map_tables_actually_output
13151
13152 An installation can choose to change which files are generated by changing the
13153 controlling lists contained in the program
13154 C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
13155
13156 Each of these files defines two hash entries to help reading programs decipher
13157 it.  One of them looks like this:
13158
13159     \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
13160
13161 where 'NAME' is a name to indicate the property.  For backwards compatibility,
13162 this is not necessarily the property's official Unicode name.  (The 'To' is
13163 also for backwards compatibility.)  The hash entry gives the format of the
13164 mapping fields of the table, currently one of the following:
13165
13166  @map_table_formats
13167
13168 This format applies only to the entries in the main body of the table.
13169 Entries defined in hashes or ones that are missing from the list can have a
13170 different format.
13171
13172 The value that the missing entries have is given by the other SwashInfo hash
13173 entry line; it looks like this:
13174
13175     \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
13176
13177 This example line says that any Unicode code points not explicitly listed in
13178 the file have the value 'NaN' under the property indicated by NAME.  If the
13179 value is the special string C<< <code point> >>, it means that the value for
13180 any missing code point is the code point itself.  This happens, for example,
13181 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
13182 character 'A', are missing because the uppercase of 'A' is itself.
13183
13184 =head1 SEE ALSO
13185
13186 L<$unicode_reference_url>
13187
13188 L<perlrecharclass>
13189
13190 L<perlunicode>
13191
13192 END
13193
13194     # And write it.  The 0 means no utf8.
13195     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
13196     return;
13197 }
13198
13199 sub make_Heavy () {
13200     # Create and write Heavy.pl, which passes info about the tables to
13201     # utf8_heavy.pl
13202
13203     my @heavy = <<END;
13204 $HEADER
13205 $INTERNAL_ONLY
13206
13207 # This file is for the use of utf8_heavy.pl
13208
13209 # Maps property names in loose standard form to its standard name
13210 \%utf8::loose_property_name_of = (
13211 END
13212
13213     push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13214     push @heavy, <<END;
13215 );
13216
13217 # Maps property, table to file for those using stricter matching
13218 \%utf8::stricter_to_file_of = (
13219 END
13220     push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13221     push @heavy, <<END;
13222 );
13223
13224 # Maps property, table to file for those using loose matching
13225 \%utf8::loose_to_file_of = (
13226 END
13227     push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13228     push @heavy, <<END;
13229 );
13230
13231 # Maps floating point to fractional form
13232 \%utf8::nv_floating_to_rational = (
13233 END
13234     push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13235     push @heavy, <<END;
13236 );
13237
13238 # If a floating point number doesn't have enough digits in it to get this
13239 # close to a fraction, it isn't considered to be that fraction even if all the
13240 # digits it does have match.
13241 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
13242
13243 # Deprecated tables to generate a warning for.  The key is the file containing
13244 # the table, so as to avoid duplication, as many property names can map to the
13245 # file, but we only need one entry for all of them.
13246 \%utf8::why_deprecated = (
13247 END
13248
13249     push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13250     push @heavy, <<END;
13251 );
13252
13253 1;
13254 END
13255
13256     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
13257     return;
13258 }
13259
13260 sub write_all_tables() {
13261     # Write out all the tables generated by this program to files, as well as
13262     # the supporting data structures, pod file, and .t file.
13263
13264     my @writables;              # List of tables that actually get written
13265     my %match_tables_to_write;  # Used to collapse identical match tables
13266                                 # into one file.  Each key is a hash function
13267                                 # result to partition tables into buckets.
13268                                 # Each value is an array of the tables that
13269                                 # fit in the bucket.
13270
13271     # For each property ...
13272     # (sort so that if there is an immutable file name, it has precedence, so
13273     # some other property can't come in and take over its file name.  If b's
13274     # file name is defined, will return 1, meaning to take it first; don't
13275     # care if both defined, as they had better be different anyway)
13276     PROPERTY:
13277     foreach my $property (sort { defined $b->file } property_ref('*')) {
13278         my $type = $property->type;
13279
13280         # And for each table for that property, starting with the mapping
13281         # table for it ...
13282         TABLE:
13283         foreach my $table($property,
13284
13285                         # and all the match tables for it (if any), sorted so
13286                         # the ones with the shortest associated file name come
13287                         # first.  The length sorting prevents problems of a
13288                         # longer file taking a name that might have to be used
13289                         # by a shorter one.  The alphabetic sorting prevents
13290                         # differences between releases
13291                         sort {  my $ext_a = $a->external_name;
13292                                 return 1 if ! defined $ext_a;
13293                                 my $ext_b = $b->external_name;
13294                                 return -1 if ! defined $ext_b;
13295                                 my $cmp = length $ext_a <=> length $ext_b;
13296
13297                                 # Return result if lengths not equal
13298                                 return $cmp if $cmp;
13299
13300                                 # Alphabetic if lengths equal
13301                                 return $ext_a cmp $ext_b
13302                         } $property->tables
13303                     )
13304         {
13305
13306             # Here we have a table associated with a property.  It could be
13307             # the map table (done first for each property), or one of the
13308             # other tables.  Determine which type.
13309             my $is_property = $table->isa('Property');
13310
13311             my $name = $table->name;
13312             my $complete_name = $table->complete_name;
13313
13314             # See if should suppress the table if is empty, but warn if it
13315             # contains something.
13316             my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
13317                                     keys %why_suppress_if_empty_warn_if_not;
13318
13319             # Calculate if this table should have any code points associated
13320             # with it or not.
13321             my $expected_empty =
13322
13323                 # $perl should be empty, as well as properties that we just
13324                 # don't do anything with
13325                 ($is_property
13326                     && ($table == $perl
13327                         || grep { $complete_name eq $_ }
13328                                                     @unimplemented_properties
13329                     )
13330                 )
13331
13332                 # Match tables in properties we skipped populating should be
13333                 # empty
13334                 || (! $is_property && ! $property->to_create_match_tables)
13335
13336                 # Tables and properties that are expected to have no code
13337                 # points should be empty
13338                 || $suppress_if_empty_warn_if_not
13339             ;
13340
13341             # Set a boolean if this table is the complement of an empty binary
13342             # table
13343             my $is_complement_of_empty_binary =
13344                 $type == $BINARY &&
13345                 (($table == $property->table('Y')
13346                     && $property->table('N')->is_empty)
13347                 || ($table == $property->table('N')
13348                     && $property->table('Y')->is_empty));
13349
13350
13351             # Some tables should match everything
13352             my $expected_full =
13353                 ($is_property)
13354                 ? # All these types of map tables will be full because
13355                   # they will have been populated with defaults
13356                   ($type == $ENUM || $type == $BINARY)
13357
13358                 : # A match table should match everything if its method
13359                   # shows it should
13360                   ($table->matches_all
13361
13362                   # The complement of an empty binary table will match
13363                   # everything
13364                   || $is_complement_of_empty_binary
13365                   )
13366             ;
13367
13368             if ($table->is_empty) {
13369
13370
13371                 if ($suppress_if_empty_warn_if_not) {
13372                     $table->set_status($SUPPRESSED,
13373                         $why_suppress_if_empty_warn_if_not{$complete_name});
13374                 }
13375
13376                 # Suppress expected empty tables.
13377                 next TABLE if $expected_empty;
13378
13379                 # And setup to later output a warning for those that aren't
13380                 # known to be allowed to be empty.  Don't do the warning if
13381                 # this table is a child of another one to avoid duplicating
13382                 # the warning that should come from the parent one.
13383                 if (($table == $property || $table->parent == $table)
13384                     && $table->status ne $SUPPRESSED
13385                     && ! grep { $complete_name =~ /^$_$/ }
13386                                                     @tables_that_may_be_empty)
13387                 {
13388                     push @unhandled_properties, "$table";
13389                 }
13390             }
13391             elsif ($expected_empty) {
13392                 my $because = "";
13393                 if ($suppress_if_empty_warn_if_not) {
13394                     $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
13395                 }
13396
13397                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
13398             }
13399
13400             my $count = $table->count;
13401             if ($expected_full) {
13402                 if ($count != $MAX_UNICODE_CODEPOINTS) {
13403                     Carp::my_carp("$table matches only "
13404                     . clarify_number($count)
13405                     . " Unicode code points but should match "
13406                     . clarify_number($MAX_UNICODE_CODEPOINTS)
13407                     . " (off by "
13408                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
13409                     . ").  Proceeding anyway.");
13410                 }
13411
13412                 # Here is expected to be full.  If it is because it is the
13413                 # complement of an (empty) binary table that is to be
13414                 # suppressed, then suppress this one as well.
13415                 if ($is_complement_of_empty_binary) {
13416                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
13417                     my $opposing = $property->table($opposing_name);
13418                     my $opposing_status = $opposing->status;
13419                     if ($opposing_status) {
13420                         $table->set_status($opposing_status,
13421                                            $opposing->status_info);
13422                     }
13423                 }
13424             }
13425             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
13426                 if ($table == $property || $table->leader == $table) {
13427                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
13428                 }
13429             }
13430
13431             if ($table->status eq $SUPPRESSED) {
13432                 if (! $is_property) {
13433                     my @children = $table->children;
13434                     foreach my $child (@children) {
13435                         if ($child->status ne $SUPPRESSED) {
13436                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
13437                         }
13438                     }
13439                 }
13440                 next TABLE;
13441
13442             }
13443             if (! $is_property) {
13444
13445                 # Several things need to be done just once for each related
13446                 # group of match tables.  Do them on the parent.
13447                 if ($table->parent == $table) {
13448
13449                     # Add an entry in the pod file for the table; it also does
13450                     # the children.
13451                     make_table_pod_entries($table) if defined $pod_directory;
13452
13453                     # See if the the table matches identical code points with
13454                     # something that has already been output.  In that case,
13455                     # no need to have two files with the same code points in
13456                     # them.  We use the table's hash() method to store these
13457                     # in buckets, so that it is quite likely that if two
13458                     # tables are in the same bucket they will be identical, so
13459                     # don't have to compare tables frequently.  The tables
13460                     # have to have the same status to share a file, so add
13461                     # this to the bucket hash.  (The reason for this latter is
13462                     # that Heavy.pl associates a status with a file.)
13463                     my $hash = $table->hash . ';' . $table->status;
13464
13465                     # Look at each table that is in the same bucket as this
13466                     # one would be.
13467                     foreach my $comparison (@{$match_tables_to_write{$hash}})
13468                     {
13469                         if ($table->matches_identically_to($comparison)) {
13470                             $table->set_equivalent_to($comparison,
13471                                                                 Related => 0);
13472                             next TABLE;
13473                         }
13474                     }
13475
13476                     # Here, not equivalent, add this table to the bucket.
13477                     push @{$match_tables_to_write{$hash}}, $table;
13478                 }
13479             }
13480             else {
13481
13482                 # Here is the property itself.
13483                 # Don't write out or make references to the $perl property
13484                 next if $table == $perl;
13485
13486                 if ($type != $STRING) {
13487
13488                     # There is a mapping stored of the various synonyms to the
13489                     # standardized name of the property for utf8_heavy.pl.
13490                     # Also, the pod file contains entries of the form:
13491                     # \p{alias: *}         \p{full: *}
13492                     # rather than show every possible combination of things.
13493
13494                     my @property_aliases = $property->aliases;
13495
13496                     # The full name of this property is stored by convention
13497                     # first in the alias array
13498                     my $full_property_name =
13499                                 '\p{' . $property_aliases[0]->name . ': *}';
13500                     my $standard_property_name = standardize($table->name);
13501
13502                     # For each synonym ...
13503                     for my $i (0 .. @property_aliases - 1)  {
13504                         my $alias = $property_aliases[$i];
13505                         my $alias_name = $alias->name;
13506                         my $alias_standard = standardize($alias_name);
13507
13508                         # Set the mapping for utf8_heavy of the alias to the
13509                         # property
13510                         if (exists ($loose_property_name_of{$alias_standard}))
13511                         {
13512                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
13513                         }
13514                         else {
13515                             $loose_property_name_of{$alias_standard}
13516                                                 = $standard_property_name;
13517                         }
13518
13519                         # Now for the pod entry for this alias.  Skip if not
13520                         # outputting a pod; skip the first one, which is the
13521                         # full name so won't have an entry like: '\p{full: *}
13522                         # \p{full: *}', and skip if don't want an entry for
13523                         # this one.
13524                         next if $i == 0
13525                                 || ! defined $pod_directory
13526                                 || ! $alias->make_pod_entry;
13527
13528                         my $rhs = $full_property_name;
13529                         if ($property != $perl && $table->perl_extension) {
13530                             $rhs .= ' (Perl extension)';
13531                         }
13532                         push @match_properties,
13533                             format_pod_line($indent_info_column,
13534                                         '\p{' . $alias->name . ': *}',
13535                                         $rhs,
13536                                         $alias->status);
13537                     }
13538                 } # End of non-string-like property code
13539
13540
13541                 # Don't output a mapping file if not desired.
13542                 next if ! $property->to_output_map;
13543             }
13544
13545             # Here, we know we want to write out the table, but don't do it
13546             # yet because there may be other tables that come along and will
13547             # want to share the file, and the file's comments will change to
13548             # mention them.  So save for later.
13549             push @writables, $table;
13550
13551         } # End of looping through the property and all its tables.
13552     } # End of looping through all properties.
13553
13554     # Now have all the tables that will have files written for them.  Do it.
13555     foreach my $table (@writables) {
13556         my @directory;
13557         my $filename;
13558         my $property = $table->property;
13559         my $is_property = ($table == $property);
13560         if (! $is_property) {
13561
13562             # Match tables for the property go in lib/$subdirectory, which is
13563             # the property's name.  Don't use the standard file name for this,
13564             # as may get an unfamiliar alias
13565             @directory = ($matches_directory, $property->external_name);
13566         }
13567         else {
13568
13569             @directory = $table->directory;
13570             $filename = $table->file;
13571         }
13572
13573         # Use specified filename if available, or default to property's
13574         # shortest name.  We need an 8.3 safe filename (which means "an 8
13575         # safe" filename, since after the dot is only 'pl', which is < 3)
13576         # The 2nd parameter is if the filename shouldn't be changed, and
13577         # it shouldn't iff there is a hard-coded name for this table.
13578         $filename = construct_filename(
13579                                 $filename || $table->external_name,
13580                                 ! $filename,    # mutable if no filename
13581                                 \@directory);
13582
13583         register_file_for_name($table, \@directory, $filename);
13584
13585         # Only need to write one file when shared by more than one
13586         # property
13587         next if ! $is_property && $table->leader != $table;
13588
13589         # Construct a nice comment to add to the file
13590         $table->set_final_comment;
13591
13592         $table->write;
13593     }
13594
13595
13596     # Write out the pod file
13597     make_pod;
13598
13599     # And Heavy.pl
13600     make_Heavy;
13601
13602     make_property_test_script() if $make_test_script;
13603     return;
13604 }
13605
13606 my @white_space_separators = ( # This used only for making the test script.
13607                             "",
13608                             ' ',
13609                             "\t",
13610                             '   '
13611                         );
13612
13613 sub generate_separator($) {
13614     # This used only for making the test script.  It generates the colon or
13615     # equal separator between the property and property value, with random
13616     # white space surrounding the separator
13617
13618     my $lhs = shift;
13619
13620     return "" if $lhs eq "";  # No separator if there's only one (the r) side
13621
13622     # Choose space before and after randomly
13623     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13624     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
13625
13626     # And return the whole complex, half the time using a colon, half the
13627     # equals
13628     return $spaces_before
13629             . (rand() < 0.5) ? '=' : ':'
13630             . $spaces_after;
13631 }
13632
13633 sub generate_tests($$$$$) {
13634     # This used only for making the test script.  It generates test cases that
13635     # are expected to compile successfully in perl.  Note that the lhs and
13636     # rhs are assumed to already be as randomized as the caller wants.
13637
13638     my $lhs = shift;           # The property: what's to the left of the colon
13639                                #  or equals separator
13640     my $rhs = shift;           # The property value; what's to the right
13641     my $valid_code = shift;    # A code point that's known to be in the
13642                                # table given by lhs=rhs; undef if table is
13643                                # empty
13644     my $invalid_code = shift;  # A code point known to not be in the table;
13645                                # undef if the table is all code points
13646     my $warning = shift;
13647
13648     # Get the colon or equal
13649     my $separator = generate_separator($lhs);
13650
13651     # The whole 'property=value'
13652     my $name = "$lhs$separator$rhs";
13653
13654     my @output;
13655     # Create a complete set of tests, with complements.
13656     if (defined $valid_code) {
13657         push @output, <<"EOC"
13658 Expect(1, $valid_code, '\\p{$name}', $warning);
13659 Expect(0, $valid_code, '\\p{^$name}', $warning);
13660 Expect(0, $valid_code, '\\P{$name}', $warning);
13661 Expect(1, $valid_code, '\\P{^$name}', $warning);
13662 EOC
13663     }
13664     if (defined $invalid_code) {
13665         push @output, <<"EOC"
13666 Expect(0, $invalid_code, '\\p{$name}', $warning);
13667 Expect(1, $invalid_code, '\\p{^$name}', $warning);
13668 Expect(1, $invalid_code, '\\P{$name}', $warning);
13669 Expect(0, $invalid_code, '\\P{^$name}', $warning);
13670 EOC
13671     }
13672     return @output;
13673 }
13674
13675 sub generate_error($$$) {
13676     # This used only for making the test script.  It generates test cases that
13677     # are expected to not only not match, but to be syntax or similar errors
13678
13679     my $lhs = shift;                # The property: what's to the left of the
13680                                     # colon or equals separator
13681     my $rhs = shift;                # The property value; what's to the right
13682     my $already_in_error = shift;   # Boolean; if true it's known that the
13683                                 # unmodified lhs and rhs will cause an error.
13684                                 # This routine should not force another one
13685     # Get the colon or equal
13686     my $separator = generate_separator($lhs);
13687
13688     # Since this is an error only, don't bother to randomly decide whether to
13689     # put the error on the left or right side; and assume that the rhs is
13690     # loosely matched, again for convenience rather than rigor.
13691     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13692
13693     my $property = $lhs . $separator . $rhs;
13694
13695     return <<"EOC";
13696 Error('\\p{$property}');
13697 Error('\\P{$property}');
13698 EOC
13699 }
13700
13701 # These are used only for making the test script
13702 # XXX Maybe should also have a bad strict seps, which includes underscore.
13703
13704 my @good_loose_seps = (
13705             " ",
13706             "-",
13707             "\t",
13708             "",
13709             "_",
13710            );
13711 my @bad_loose_seps = (
13712            "/a/",
13713            ':=',
13714           );
13715
13716 sub randomize_stricter_name {
13717     # This used only for making the test script.  Take the input name and
13718     # return a randomized, but valid version of it under the stricter matching
13719     # rules.
13720
13721     my $name = shift;
13722     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13723
13724     # If the name looks like a number (integer, floating, or rational), do
13725     # some extra work
13726     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13727         my $sign = $1;
13728         my $number = $2;
13729         my $separator = $3;
13730
13731         # If there isn't a sign, part of the time add a plus
13732         # Note: Not testing having any denominator having a minus sign
13733         if (! $sign) {
13734             $sign = '+' if rand() <= .3;
13735         }
13736
13737         # And add 0 or more leading zeros.
13738         $name = $sign . ('0' x int rand(10)) . $number;
13739
13740         if (defined $separator) {
13741             my $extra_zeros = '0' x int rand(10);
13742
13743             if ($separator eq '.') {
13744
13745                 # Similarly, add 0 or more trailing zeros after a decimal
13746                 # point
13747                 $name .= $extra_zeros;
13748             }
13749             else {
13750
13751                 # Or, leading zeros before the denominator
13752                 $name =~ s,/,/$extra_zeros,;
13753             }
13754         }
13755     }
13756
13757     # For legibility of the test, only change the case of whole sections at a
13758     # time.  To do this, first split into sections.  The split returns the
13759     # delimiters
13760     my @sections;
13761     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13762         trace $section if main::DEBUG && $to_trace;
13763
13764         if (length $section > 1 && $section !~ /\D/) {
13765
13766             # If the section is a sequence of digits, about half the time
13767             # randomly add underscores between some of them.
13768             if (rand() > .5) {
13769
13770                 # Figure out how many underscores to add.  max is 1 less than
13771                 # the number of digits.  (But add 1 at the end to make sure
13772                 # result isn't 0, and compensate earlier by subtracting 2
13773                 # instead of 1)
13774                 my $num_underscores = int rand(length($section) - 2) + 1;
13775
13776                 # And add them evenly throughout, for convenience, not rigor
13777                 use integer;
13778                 my $spacing = (length($section) - 1)/ $num_underscores;
13779                 my $temp = $section;
13780                 $section = "";
13781                 for my $i (1 .. $num_underscores) {
13782                     $section .= substr($temp, 0, $spacing, "") . '_';
13783                 }
13784                 $section .= $temp;
13785             }
13786             push @sections, $section;
13787         }
13788         else {
13789
13790             # Here not a sequence of digits.  Change the case of the section
13791             # randomly
13792             my $switch = int rand(4);
13793             if ($switch == 0) {
13794                 push @sections, uc $section;
13795             }
13796             elsif ($switch == 1) {
13797                 push @sections, lc $section;
13798             }
13799             elsif ($switch == 2) {
13800                 push @sections, ucfirst $section;
13801             }
13802             else {
13803                 push @sections, $section;
13804             }
13805         }
13806     }
13807     trace "returning", join "", @sections if main::DEBUG && $to_trace;
13808     return join "", @sections;
13809 }
13810
13811 sub randomize_loose_name($;$) {
13812     # This used only for making the test script
13813
13814     my $name = shift;
13815     my $want_error = shift;  # if true, make an error
13816     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13817
13818     $name = randomize_stricter_name($name);
13819
13820     my @parts;
13821     push @parts, $good_loose_seps[rand(@good_loose_seps)];
13822     for my $part (split /[-\s_]+/, $name) {
13823         if (@parts) {
13824             if ($want_error and rand() < 0.3) {
13825                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13826                 $want_error = 0;
13827             }
13828             else {
13829                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13830             }
13831         }
13832         push @parts, $part;
13833     }
13834     my $new = join("", @parts);
13835     trace "$name => $new" if main::DEBUG && $to_trace;
13836
13837     if ($want_error) {
13838         if (rand() >= 0.5) {
13839             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13840         }
13841         else {
13842             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13843         }
13844     }
13845     return $new;
13846 }
13847
13848 # Used to make sure don't generate duplicate test cases.
13849 my %test_generated;
13850
13851 sub make_property_test_script() {
13852     # This used only for making the test script
13853     # this written directly -- it's huge.
13854
13855     print "Making test script\n" if $verbosity >= $PROGRESS;
13856
13857     # This uses randomness to test different possibilities without testing all
13858     # possibilities.  To ensure repeatability, set the seed to 0.  But if
13859     # tests are added, it will perturb all later ones in the .t file
13860     srand 0;
13861
13862     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13863
13864     # Keep going down an order of magnitude
13865     # until find that adding this quantity to
13866     # 1 remains 1; but put an upper limit on
13867     # this so in case this algorithm doesn't
13868     # work properly on some platform, that we
13869     # won't loop forever.
13870     my $digits = 0;
13871     my $min_floating_slop = 1;
13872     while (1+ $min_floating_slop != 1
13873             && $digits++ < 50)
13874     {
13875         my $next = $min_floating_slop / 10;
13876         last if $next == 0; # If underflows,
13877                             # use previous one
13878         $min_floating_slop = $next;
13879     }
13880
13881     # It doesn't matter whether the elements of this array contain single lines
13882     # or multiple lines. main::write doesn't count the lines.
13883     my @output;
13884
13885     foreach my $property (property_ref('*')) {
13886         foreach my $table ($property->tables) {
13887
13888             # Find code points that match, and don't match this table.
13889             my $valid = $table->get_valid_code_point;
13890             my $invalid = $table->get_invalid_code_point;
13891             my $warning = ($table->status eq $DEPRECATED)
13892                             ? "'deprecated'"
13893                             : '""';
13894
13895             # Test each possible combination of the property's aliases with
13896             # the table's.  If this gets to be too many, could do what is done
13897             # in the set_final_comment() for Tables
13898             my @table_aliases = $table->aliases;
13899             my @property_aliases = $table->property->aliases;
13900             my $max = max(scalar @table_aliases, scalar @property_aliases);
13901             for my $j (0 .. $max - 1) {
13902
13903                 # The current alias for property is the next one on the list,
13904                 # or if beyond the end, start over.  Similarly for table
13905                 my $property_name
13906                             = $property_aliases[$j % @property_aliases]->name;
13907
13908                 $property_name = "" if $table->property == $perl;
13909                 my $table_alias = $table_aliases[$j % @table_aliases];
13910                 my $table_name = $table_alias->name;
13911                 my $loose_match = $table_alias->loose_match;
13912
13913                 # If the table doesn't have a file, any test for it is
13914                 # already guaranteed to be in error
13915                 my $already_error = ! $table->file_path;
13916
13917                 # Generate error cases for this alias.
13918                 push @output, generate_error($property_name,
13919                                              $table_name,
13920                                              $already_error);
13921
13922                 # If the table is guaranteed to always generate an error,
13923                 # quit now without generating success cases.
13924                 next if $already_error;
13925
13926                 # Now for the success cases.
13927                 my $random;
13928                 if ($loose_match) {
13929
13930                     # For loose matching, create an extra test case for the
13931                     # standard name.
13932                     my $standard = standardize($table_name);
13933
13934                     # $test_name should be a unique combination for each test
13935                     # case; used just to avoid duplicate tests
13936                     my $test_name = "$property_name=$standard";
13937
13938                     # Don't output duplicate test cases.
13939                     if (! exists $test_generated{$test_name}) {
13940                         $test_generated{$test_name} = 1;
13941                         push @output, generate_tests($property_name,
13942                                                      $standard,
13943                                                      $valid,
13944                                                      $invalid,
13945                                                      $warning,
13946                                                  );
13947                     }
13948                     $random = randomize_loose_name($table_name)
13949                 }
13950                 else { # Stricter match
13951                     $random = randomize_stricter_name($table_name);
13952                 }
13953
13954                 # Now for the main test case for this alias.
13955                 my $test_name = "$property_name=$random";
13956                 if (! exists $test_generated{$test_name}) {
13957                     $test_generated{$test_name} = 1;
13958                     push @output, generate_tests($property_name,
13959                                                  $random,
13960                                                  $valid,
13961                                                  $invalid,
13962                                                  $warning,
13963                                              );
13964
13965                     # If the name is a rational number, add tests for the
13966                     # floating point equivalent.
13967                     if ($table_name =~ qr{/}) {
13968
13969                         # Calculate the float, and find just the fraction.
13970                         my $float = eval $table_name;
13971                         my ($whole, $fraction)
13972                                             = $float =~ / (.*) \. (.*) /x;
13973
13974                         # Starting with one digit after the decimal point,
13975                         # create a test for each possible precision (number of
13976                         # digits past the decimal point) until well beyond the
13977                         # native number found on this machine.  (If we started
13978                         # with 0 digits, it would be an integer, which could
13979                         # well match an unrelated table)
13980                         PLACE:
13981                         for my $i (1 .. $min_floating_slop + 3) {
13982                             my $table_name = sprintf("%.*f", $i, $float);
13983                             if ($i < $MIN_FRACTION_LENGTH) {
13984
13985                                 # If the test case has fewer digits than the
13986                                 # minimum acceptable precision, it shouldn't
13987                                 # succeed, so we expect an error for it.
13988                                 # E.g., 2/3 = .7 at one decimal point, and we
13989                                 # shouldn't say it matches .7.  We should make
13990                                 # it be .667 at least before agreeing that the
13991                                 # intent was to match 2/3.  But at the
13992                                 # less-than- acceptable level of precision, it
13993                                 # might actually match an unrelated number.
13994                                 # So don't generate a test case if this
13995                                 # conflating is possible.  In our example, we
13996                                 # don't want 2/3 matching 7/10, if there is
13997                                 # a 7/10 code point.
13998                                 for my $existing
13999                                         (keys %nv_floating_to_rational)
14000                                 {
14001                                     next PLACE
14002                                         if abs($table_name - $existing)
14003                                                 < $MAX_FLOATING_SLOP;
14004                                 }
14005                                 push @output, generate_error($property_name,
14006                                                              $table_name,
14007                                                              1   # 1 => already an error
14008                                               );
14009                             }
14010                             else {
14011
14012                                 # Here the number of digits exceeds the
14013                                 # minimum we think is needed.  So generate a
14014                                 # success test case for it.
14015                                 push @output, generate_tests($property_name,
14016                                                              $table_name,
14017                                                              $valid,
14018                                                              $invalid,
14019                                                              $warning,
14020                                              );
14021                             }
14022                         }
14023                     }
14024                 }
14025             }
14026         }
14027     }
14028
14029     &write($t_path,
14030            0,           # Not utf8;
14031            [<DATA>,
14032             @output,
14033             (map {"Test_X('$_');\n"} @backslash_X_tests),
14034             "Finished();\n"]);
14035     return;
14036 }
14037
14038 # This is a list of the input files and how to handle them.  The files are
14039 # processed in their order in this list.  Some reordering is possible if
14040 # desired, but the v0 files should be first, and the extracted before the
14041 # others except DAge.txt (as data in an extracted file can be over-ridden by
14042 # the non-extracted.  Some other files depend on data derived from an earlier
14043 # file, like UnicodeData requires data from Jamo, and the case changing and
14044 # folding requires data from Unicode.  Mostly, it safest to order by first
14045 # version releases in (except the Jamo).  DAge.txt is read before the
14046 # extracted ones because of the rarely used feature $compare_versions.  In the
14047 # unlikely event that there were ever an extracted file that contained the Age
14048 # property information, it would have to go in front of DAge.
14049 #
14050 # The version strings allow the program to know whether to expect a file or
14051 # not, but if a file exists in the directory, it will be processed, even if it
14052 # is in a version earlier than expected, so you can copy files from a later
14053 # release into an earlier release's directory.
14054 my @input_file_objects = (
14055     Input_file->new('PropertyAliases.txt', v0,
14056                     Handler => \&process_PropertyAliases,
14057                     ),
14058     Input_file->new(undef, v0,  # No file associated with this
14059                     Progress_Message => 'Finishing property setup',
14060                     Handler => \&finish_property_setup,
14061                     ),
14062     Input_file->new('PropValueAliases.txt', v0,
14063                      Handler => \&process_PropValueAliases,
14064                      Has_Missings_Defaults => $NOT_IGNORED,
14065                      ),
14066     Input_file->new('DAge.txt', v3.2.0,
14067                     Has_Missings_Defaults => $NOT_IGNORED,
14068                     Property => 'Age'
14069                     ),
14070     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
14071                     Property => 'General_Category',
14072                     ),
14073     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
14074                     Property => 'Canonical_Combining_Class',
14075                     Has_Missings_Defaults => $NOT_IGNORED,
14076                     ),
14077     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
14078                     Property => 'Numeric_Type',
14079                     Has_Missings_Defaults => $NOT_IGNORED,
14080                     ),
14081     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
14082                     Property => 'East_Asian_Width',
14083                     Has_Missings_Defaults => $NOT_IGNORED,
14084                     ),
14085     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
14086                     Property => 'Line_Break',
14087                     Has_Missings_Defaults => $NOT_IGNORED,
14088                     ),
14089     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14090                     Property => 'Bidi_Class',
14091                     Has_Missings_Defaults => $NOT_IGNORED,
14092                     ),
14093     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14094                     Property => 'Decomposition_Type',
14095                     Has_Missings_Defaults => $NOT_IGNORED,
14096                     ),
14097     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14098     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14099                     Property => 'Numeric_Value',
14100                     Each_Line_Handler => \&filter_numeric_value_line,
14101                     Has_Missings_Defaults => $NOT_IGNORED,
14102                     ),
14103     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14104                     Property => 'Joining_Group',
14105                     Has_Missings_Defaults => $NOT_IGNORED,
14106                     ),
14107
14108     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14109                     Property => 'Joining_Type',
14110                     Has_Missings_Defaults => $NOT_IGNORED,
14111                     ),
14112     Input_file->new('Jamo.txt', v2.0.0,
14113                     Property => 'Jamo_Short_Name',
14114                     Each_Line_Handler => \&filter_jamo_line,
14115                     ),
14116     Input_file->new('UnicodeData.txt', v1.1.5,
14117                     Pre_Handler => \&setup_UnicodeData,
14118
14119                     # We clean up this file for some early versions.
14120                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
14121                                             ? \&filter_v1_ucd
14122                                             : ($v_version eq v2.1.5)
14123                                                 ? \&filter_v2_1_5_ucd
14124
14125                                                 # And for 5.14 Perls with 6.0,
14126                                                 # have to also make changes
14127                                                 : ($v_version ge v6.0.0)
14128                                                     ? \&filter_v6_ucd
14129                                                     : undef),
14130
14131                                             # And the main filter
14132                                             \&filter_UnicodeData_line,
14133                                          ],
14134                     EOF_Handler => \&EOF_UnicodeData,
14135                     ),
14136     Input_file->new('ArabicShaping.txt', v2.0.0,
14137                     Each_Line_Handler =>
14138                         [ ($v_version lt 4.1.0)
14139                                     ? \&filter_old_style_arabic_shaping
14140                                     : undef,
14141                         \&filter_arabic_shaping_line,
14142                         ],
14143                     Has_Missings_Defaults => $NOT_IGNORED,
14144                     ),
14145     Input_file->new('Blocks.txt', v2.0.0,
14146                     Property => 'Block',
14147                     Has_Missings_Defaults => $NOT_IGNORED,
14148                     Each_Line_Handler => \&filter_blocks_lines
14149                     ),
14150     Input_file->new('PropList.txt', v2.0.0,
14151                     Each_Line_Handler => (($v_version lt v3.1.0)
14152                                             ? \&filter_old_style_proplist
14153                                             : undef),
14154                     ),
14155     Input_file->new('Unihan.txt', v2.0.0,
14156                     Pre_Handler => \&setup_unihan,
14157                     Optional => 1,
14158                     Each_Line_Handler => \&filter_unihan_line,
14159                         ),
14160     Input_file->new('SpecialCasing.txt', v2.1.8,
14161                     Each_Line_Handler => \&filter_special_casing_line,
14162                     Pre_Handler => \&setup_special_casing,
14163                     ),
14164     Input_file->new(
14165                     'LineBreak.txt', v3.0.0,
14166                     Has_Missings_Defaults => $NOT_IGNORED,
14167                     Property => 'Line_Break',
14168                     # Early versions had problematic syntax
14169                     Each_Line_Handler => (($v_version lt v3.1.0)
14170                                         ? \&filter_early_ea_lb
14171                                         : undef),
14172                     ),
14173     Input_file->new('EastAsianWidth.txt', v3.0.0,
14174                     Property => 'East_Asian_Width',
14175                     Has_Missings_Defaults => $NOT_IGNORED,
14176                     # Early versions had problematic syntax
14177                     Each_Line_Handler => (($v_version lt v3.1.0)
14178                                         ? \&filter_early_ea_lb
14179                                         : undef),
14180                     ),
14181     Input_file->new('CompositionExclusions.txt', v3.0.0,
14182                     Property => 'Composition_Exclusion',
14183                     ),
14184     Input_file->new('BidiMirroring.txt', v3.0.1,
14185                     Property => 'Bidi_Mirroring_Glyph',
14186                     ),
14187     Input_file->new("NormalizationTest.txt", v3.0.1,
14188                     Skip => 1,
14189                     ),
14190     Input_file->new('CaseFolding.txt', v3.0.1,
14191                     Pre_Handler => \&setup_case_folding,
14192                     Each_Line_Handler =>
14193                         [ ($v_version lt v3.1.0)
14194                                  ? \&filter_old_style_case_folding
14195                                  : undef,
14196                            \&filter_case_folding_line
14197                         ],
14198                     Post_Handler => \&post_fold,
14199                     ),
14200     Input_file->new('DCoreProperties.txt', v3.1.0,
14201                     # 5.2 changed this file
14202                     Has_Missings_Defaults => (($v_version ge v5.2.0)
14203                                             ? $NOT_IGNORED
14204                                             : $NO_DEFAULTS),
14205                     ),
14206     Input_file->new('Scripts.txt', v3.1.0,
14207                     Property => 'Script',
14208                     Has_Missings_Defaults => $NOT_IGNORED,
14209                     ),
14210     Input_file->new('DNormalizationProps.txt', v3.1.0,
14211                     Has_Missings_Defaults => $NOT_IGNORED,
14212                     Each_Line_Handler => (($v_version lt v4.0.1)
14213                                       ? \&filter_old_style_normalization_lines
14214                                       : undef),
14215                     ),
14216     Input_file->new('HangulSyllableType.txt', v4.0.0,
14217                     Has_Missings_Defaults => $NOT_IGNORED,
14218                     Property => 'Hangul_Syllable_Type'),
14219     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
14220                     Property => 'Word_Break',
14221                     Has_Missings_Defaults => $NOT_IGNORED,
14222                     ),
14223     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
14224                     Property => 'Grapheme_Cluster_Break',
14225                     Has_Missings_Defaults => $NOT_IGNORED,
14226                     ),
14227     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
14228                     Handler => \&process_GCB_test,
14229                     ),
14230     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
14231                     Skip => 1,
14232                     ),
14233     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
14234                     Skip => 1,
14235                     ),
14236     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
14237                     Skip => 1,
14238                     ),
14239     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
14240                     Property => 'Sentence_Break',
14241                     Has_Missings_Defaults => $NOT_IGNORED,
14242                     ),
14243     Input_file->new('NamedSequences.txt', v4.1.0,
14244                     Handler => \&process_NamedSequences
14245                     ),
14246     Input_file->new('NameAliases.txt', v5.0.0,
14247                     Property => 'Name_Alias',
14248                     ),
14249     Input_file->new("BidiTest.txt", v5.2.0,
14250                     Skip => 1,
14251                     ),
14252     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
14253                     Optional => 1,
14254                     Each_Line_Handler => \&filter_unihan_line,
14255                     ),
14256     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
14257                     Optional => 1,
14258                     Each_Line_Handler => \&filter_unihan_line,
14259                     ),
14260     Input_file->new('UnihanIRGSources.txt', v5.2.0,
14261                     Optional => 1,
14262                     Pre_Handler => \&setup_unihan,
14263                     Each_Line_Handler => \&filter_unihan_line,
14264                     ),
14265     Input_file->new('UnihanNumericValues.txt', v5.2.0,
14266                     Optional => 1,
14267                     Each_Line_Handler => \&filter_unihan_line,
14268                     ),
14269     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
14270                     Optional => 1,
14271                     Each_Line_Handler => \&filter_unihan_line,
14272                     ),
14273     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
14274                     Optional => 1,
14275                     Each_Line_Handler => \&filter_unihan_line,
14276                     ),
14277     Input_file->new('UnihanReadings.txt', v5.2.0,
14278                     Optional => 1,
14279                     Each_Line_Handler => \&filter_unihan_line,
14280                     ),
14281     Input_file->new('UnihanVariants.txt', v5.2.0,
14282                     Optional => 1,
14283                     Each_Line_Handler => \&filter_unihan_line,
14284                     ),
14285 );
14286
14287 # End of all the preliminaries.
14288 # Do it...
14289
14290 if ($compare_versions) {
14291     Carp::my_carp(<<END
14292 Warning.  \$compare_versions is set.  Output is not suitable for production
14293 END
14294     );
14295 }
14296
14297 # Put into %potential_files a list of all the files in the directory structure
14298 # that could be inputs to this program, excluding those that we should ignore.
14299 # Use absolute file names because it makes it easier across machine types.
14300 my @ignored_files_full_names = map { File::Spec->rel2abs(
14301                                      internal_file_to_platform($_))
14302                                 } keys %ignored_files;
14303 File::Find::find({
14304     wanted=>sub {
14305         return unless /\.txt$/i;  # Some platforms change the name's case
14306         my $full = lc(File::Spec->rel2abs($_));
14307         $potential_files{$full} = 1
14308                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
14309         return;
14310     }
14311 }, File::Spec->curdir());
14312
14313 my @mktables_list_output_files;
14314 my $old_start_time = 0;
14315
14316 if (! -e $file_list) {
14317     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
14318     $write_unchanged_files = 1;
14319 } elsif ($write_unchanged_files) {
14320     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
14321 }
14322 else {
14323     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
14324     my $file_handle;
14325     if (! open $file_handle, "<", $file_list) {
14326         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
14327         $glob_list = 1;
14328     }
14329     else {
14330         my @input;
14331
14332         # Read and parse mktables.lst, placing the results from the first part
14333         # into @input, and the second part into @mktables_list_output_files
14334         for my $list ( \@input, \@mktables_list_output_files ) {
14335             while (<$file_handle>) {
14336                 s/^ \s+ | \s+ $//xg;
14337                 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
14338                     $old_start_time = $1;
14339                 }
14340                 next if /^ \s* (?: \# .* )? $/x;
14341                 last if /^ =+ $/x;
14342                 my ( $file ) = split /\t/;
14343                 push @$list, $file;
14344             }
14345             @$list = uniques(@$list);
14346             next;
14347         }
14348
14349         # Look through all the input files
14350         foreach my $input (@input) {
14351             next if $input eq 'version'; # Already have checked this.
14352
14353             # Ignore if doesn't exist.  The checking about whether we care or
14354             # not is done via the Input_file object.
14355             next if ! file_exists($input);
14356
14357             # The paths are stored with relative names, and with '/' as the
14358             # delimiter; convert to absolute on this machine
14359             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
14360             $potential_files{$full} = 1
14361                         if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
14362         }
14363     }
14364
14365     close $file_handle;
14366 }
14367
14368 if ($glob_list) {
14369
14370     # Here wants to process all .txt files in the directory structure.
14371     # Convert them to full path names.  They are stored in the platform's
14372     # relative style
14373     my @known_files;
14374     foreach my $object (@input_file_objects) {
14375         my $file = $object->file;
14376         next unless defined $file;
14377         push @known_files, File::Spec->rel2abs($file);
14378     }
14379
14380     my @unknown_input_files;
14381     foreach my $file (keys %potential_files) {
14382         next if grep { lc($file) eq lc($_) } @known_files;
14383
14384         # Here, the file is unknown to us.  Get relative path name
14385         $file = File::Spec->abs2rel($file);
14386         push @unknown_input_files, $file;
14387
14388         # What will happen is we create a data structure for it, and add it to
14389         # the list of input files to process.  First get the subdirectories
14390         # into an array
14391         my (undef, $directories, undef) = File::Spec->splitpath($file);
14392         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
14393         my @directories = File::Spec->splitdir($directories);
14394
14395         # If the file isn't extracted (meaning none of the directories is the
14396         # extracted one), just add it to the end of the list of inputs.
14397         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
14398             push @input_file_objects, Input_file->new($file, v0);
14399         }
14400         else {
14401
14402             # Here, the file is extracted.  It needs to go ahead of most other
14403             # processing.  Search for the first input file that isn't a
14404             # special required property (that is, find one whose first_release
14405             # is non-0), and isn't extracted.  Also, the Age property file is
14406             # processed before the extracted ones, just in case
14407             # $compare_versions is set.
14408             for (my $i = 0; $i < @input_file_objects; $i++) {
14409                 if ($input_file_objects[$i]->first_released ne v0
14410                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
14411                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
14412                 {
14413                     splice @input_file_objects, $i, 0,
14414                                                 Input_file->new($file, v0);
14415                     last;
14416                 }
14417             }
14418
14419         }
14420     }
14421     if (@unknown_input_files) {
14422         print STDERR simple_fold(join_lines(<<END
14423
14424 The following files are unknown as to how to handle.  Assuming they are
14425 typical property files.  You'll know by later error messages if it worked or
14426 not:
14427 END
14428         ) . " " . join(", ", @unknown_input_files) . "\n\n");
14429     }
14430 } # End of looking through directory structure for more .txt files.
14431
14432 # Create the list of input files from the objects we have defined, plus
14433 # version
14434 my @input_files = 'version';
14435 foreach my $object (@input_file_objects) {
14436     my $file = $object->file;
14437     next if ! defined $file;    # Not all objects have files
14438     next if $object->optional && ! -e $file;
14439     push @input_files,  $file;
14440 }
14441
14442 if ( $verbosity >= $VERBOSE ) {
14443     print "Expecting ".scalar( @input_files )." input files. ",
14444          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
14445 }
14446
14447 # We set $most_recent to be the most recently changed input file, including
14448 # this program itself (done much earlier in this file)
14449 foreach my $in (@input_files) {
14450     next unless -e $in;        # Keep going even if missing a file
14451     my $mod_time = (stat $in)[9];
14452     $most_recent = $mod_time if $mod_time > $most_recent;
14453
14454     # See that the input files have distinct names, to warn someone if they
14455     # are adding a new one
14456     if ($make_list) {
14457         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
14458         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
14459         my @directories = File::Spec->splitdir($directories);
14460         my $base = $file =~ s/\.txt$//;
14461         construct_filename($file, 'mutable', \@directories);
14462     }
14463 }
14464
14465 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
14466               || ! scalar @mktables_list_output_files  # or if no outputs known
14467               || $old_start_time < $most_recent;       # or out-of-date
14468
14469 # Now we check to see if any output files are older than youngest, if
14470 # they are, we need to continue on, otherwise we can presumably bail.
14471 if (! $rebuild) {
14472     foreach my $out (@mktables_list_output_files) {
14473         if ( ! file_exists($out)) {
14474             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
14475             $rebuild = 1;
14476             last;
14477          }
14478         #local $to_trace = 1 if main::DEBUG;
14479         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14480         if ( (stat $out)[9] <= $most_recent ) {
14481             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
14482             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
14483             $rebuild = 1;
14484             last;
14485         }
14486     }
14487 }
14488 if (! $rebuild) {
14489     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
14490     exit(0);
14491 }
14492 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
14493
14494 # Ready to do the major processing.  First create the perl pseudo-property.
14495 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
14496
14497 # Process each input file
14498 foreach my $file (@input_file_objects) {
14499     $file->run;
14500 }
14501
14502 # Finish the table generation.
14503
14504 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
14505 finish_Unicode();
14506
14507 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
14508 compile_perl();
14509
14510 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
14511 add_perl_synonyms();
14512
14513 print "Writing tables\n" if $verbosity >= $PROGRESS;
14514 write_all_tables();
14515
14516 # Write mktables.lst
14517 if ( $file_list and $make_list ) {
14518
14519     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14520     foreach my $file (@input_files, @files_actually_output) {
14521         my (undef, $directories, $file) = File::Spec->splitpath($file);
14522         my @directories = File::Spec->splitdir($directories);
14523         $file = join '/', @directories, $file;
14524     }
14525
14526     my $ofh;
14527     if (! open $ofh,">",$file_list) {
14528         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
14529         return
14530     }
14531     else {
14532         my $localtime = localtime $start_time;
14533         print $ofh <<"END";
14534 #
14535 # $file_list -- File list for $0.
14536 #
14537 #   Autogenerated starting on $start_time ($localtime)
14538 #
14539 # - First section is input files
14540 #   ($0 itself is not listed but is automatically considered an input)
14541 # - Section separator is /^=+\$/
14542 # - Second section is a list of output files.
14543 # - Lines matching /^\\s*#/ are treated as comments
14544 #   which along with blank lines are ignored.
14545 #
14546
14547 # Input files:
14548
14549 END
14550         print $ofh "$_\n" for sort(@input_files);
14551         print $ofh "\n=================================\n# Output files:\n\n";
14552         print $ofh "$_\n" for sort @files_actually_output;
14553         print $ofh "\n# ",scalar(@input_files)," input files\n",
14554                 "# ",scalar(@files_actually_output)+1," output files\n\n",
14555                 "# End list\n";
14556         close $ofh
14557             or Carp::my_carp("Failed to close $ofh: $!");
14558
14559         print "Filelist has ",scalar(@input_files)," input files and ",
14560             scalar(@files_actually_output)+1," output files\n"
14561             if $verbosity >= $VERBOSE;
14562     }
14563 }
14564
14565 # Output these warnings unless -q explicitly specified.
14566 if ($verbosity >= $NORMAL_VERBOSITY) {
14567     if (@unhandled_properties) {
14568         print "\nProperties and tables that unexpectedly have no code points\n";
14569         foreach my $property (sort @unhandled_properties) {
14570             print $property, "\n";
14571         }
14572     }
14573
14574     if (%potential_files) {
14575         print "\nInput files that are not considered:\n";
14576         foreach my $file (sort keys %potential_files) {
14577             print File::Spec->abs2rel($file), "\n";
14578         }
14579     }
14580     print "\nAll done\n" if $verbosity >= $VERBOSE;
14581 }
14582 exit(0);
14583
14584 # TRAILING CODE IS USED BY make_property_test_script()
14585 __DATA__
14586
14587 use strict;
14588 use warnings;
14589
14590 # If run outside the normal test suite on an ASCII platform, you can
14591 # just create a latin1_to_native() function that just returns its
14592 # inputs, because that's the only function used from test.pl
14593 require "test.pl";
14594
14595 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
14596 # constructed by mktables from the tables it generates, so if mktables is
14597 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
14598 # feasible properties; a few aren't currently feasible; see
14599 # is_code_point_usable() in mktables for details.
14600
14601 # Standard test packages are not used because this manipulates SIG_WARN.  It
14602 # exits 0 if every non-skipped test succeeded; -1 if any failed.
14603
14604 my $Tests = 0;
14605 my $Fails = 0;
14606
14607 sub Expect($$$$) {
14608     my $expected = shift;
14609     my $ord = shift;
14610     my $regex  = shift;
14611     my $warning_type = shift;   # Type of warning message, like 'deprecated'
14612                                 # or empty if none
14613     my $line   = (caller)[2];
14614     $ord = ord(latin1_to_native(chr($ord)));
14615
14616     # Convert the code point to hex form
14617     my $string = sprintf "\"\\x{%04X}\"", $ord;
14618
14619     my @tests = "";
14620
14621     # The first time through, use all warnings.  If the input should generate
14622     # a warning, add another time through with them turned off
14623     push @tests, "no warnings '$warning_type';" if $warning_type;
14624
14625     foreach my $no_warnings (@tests) {
14626
14627         # Store any warning messages instead of outputting them
14628         local $SIG{__WARN__} = $SIG{__WARN__};
14629         my $warning_message;
14630         $SIG{__WARN__} = sub { $warning_message = $_[0] };
14631
14632         $Tests++;
14633
14634         # A string eval is needed because of the 'no warnings'.
14635         # Assumes no parens in the regular expression
14636         my $result = eval "$no_warnings
14637                             my \$RegObj = qr($regex);
14638                             $string =~ \$RegObj ? 1 : 0";
14639         if (not defined $result) {
14640             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
14641             $Fails++;
14642         }
14643         elsif ($result ^ $expected) {
14644             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
14645             $Fails++;
14646         }
14647         elsif ($warning_message) {
14648             if (! $warning_type || ($warning_type && $no_warnings)) {
14649                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14650                 $Fails++;
14651             }
14652             else {
14653                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14654             }
14655         }
14656         elsif ($warning_type && ! $no_warnings) {
14657             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14658             $Fails++;
14659         }
14660         else {
14661             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14662         }
14663     }
14664     return;
14665 }
14666
14667 sub Error($) {
14668     my $regex  = shift;
14669     $Tests++;
14670     if (eval { 'x' =~ qr/$regex/; 1 }) {
14671         $Fails++;
14672         my $line = (caller)[2];
14673         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
14674     }
14675     else {
14676         my $line = (caller)[2];
14677         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14678     }
14679     return;
14680 }
14681
14682 # GCBTest.txt character that separates grapheme clusters
14683 my $breakable_utf8 = my $breakable = chr(0xF7);
14684 utf8::upgrade($breakable_utf8);
14685
14686 # GCBTest.txt character that indicates that the adjoining code points are part
14687 # of the same grapheme cluster
14688 my $nobreak_utf8 = my $nobreak = chr(0xD7);
14689 utf8::upgrade($nobreak_utf8);
14690
14691 sub Test_X($) {
14692     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
14693     # Each such line is a sequence of code points given by their hex numbers,
14694     # separated by the two characters defined just before this subroutine that
14695     # indicate that either there can or cannot be a break between the adjacent
14696     # code points.  If there isn't a break, that means the sequence forms an
14697     # extended grapheme cluster, which means that \X should match the whole
14698     # thing.  If there is a break, \X should stop there.  This is all
14699     # converted by this routine into a match:
14700     #   $string =~ /(\X)/,
14701     # Each \X should match the next cluster; and that is what is checked.
14702
14703     my $template = shift;
14704
14705     my $line   = (caller)[2];
14706
14707     # The line contains characters above the ASCII range, but in Latin1.  It
14708     # may or may not be in utf8, and if it is, it may or may not know it.  So,
14709     # convert these characters to 8 bits.  If knows is in utf8, simply
14710     # downgrade.
14711     if (utf8::is_utf8($template)) {
14712         utf8::downgrade($template);
14713     } else {
14714
14715         # Otherwise, if it is in utf8, but doesn't know it, the next lines
14716         # convert the two problematic characters to their 8-bit equivalents.
14717         # If it isn't in utf8, they don't harm anything.
14718         use bytes;
14719         $template =~ s/$nobreak_utf8/$nobreak/g;
14720         $template =~ s/$breakable_utf8/$breakable/g;
14721     }
14722
14723     # Get rid of the leading and trailing breakables
14724     $template =~ s/^ \s* $breakable \s* //x;
14725     $template =~ s/ \s* $breakable \s* $ //x;
14726
14727     # And no-breaks become just a space.
14728     $template =~ s/ \s* $nobreak \s* / /xg;
14729
14730     # Split the input into segments that are breakable between them.
14731     my @segments = split /\s*$breakable\s*/, $template;
14732
14733     my $string = "";
14734     my $display_string = "";
14735     my @should_match;
14736     my @should_display;
14737
14738     # Convert the code point sequence in each segment into a Perl string of
14739     # characters
14740     foreach my $segment (@segments) {
14741         my @code_points = split /\s+/, $segment;
14742         my $this_string = "";
14743         my $this_display = "";
14744         foreach my $code_point (@code_points) {
14745             $this_string .= latin1_to_native(chr(hex $code_point));
14746             $this_display .= "\\x{$code_point}";
14747         }
14748
14749         # The next cluster should match the string in this segment.
14750         push @should_match, $this_string;
14751         push @should_display, $this_display;
14752         $string .= $this_string;
14753         $display_string .= $this_display;
14754     }
14755
14756     # If a string can be represented in both non-ut8 and utf8, test both cases
14757     UPGRADE:
14758     for my $to_upgrade (0 .. 1) {
14759
14760         if ($to_upgrade) {
14761
14762             # If already in utf8, would just be a repeat
14763             next UPGRADE if utf8::is_utf8($string);
14764
14765             utf8::upgrade($string);
14766         }
14767
14768         # Finally, do the \X match.
14769         my @matches = $string =~ /(\X)/g;
14770
14771         # Look through each matched cluster to verify that it matches what we
14772         # expect.
14773         my $min = (@matches < @should_match) ? @matches : @should_match;
14774         for my $i (0 .. $min - 1) {
14775             $Tests++;
14776             if ($matches[$i] eq $should_match[$i]) {
14777                 print "ok $Tests - ";
14778                 if ($i == 0) {
14779                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14780                 } else {
14781                     print "And \\X #", $i + 1,
14782                 }
14783                 print " correctly matched $should_display[$i]; line $line\n";
14784             } else {
14785                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14786                                                     unpack("U*", $matches[$i]));
14787                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14788                     $i + 1,
14789                     " should have matched $should_display[$i]",
14790                     " but instead matched $matches[$i]",
14791                     ".  Abandoning rest of line $line\n";
14792                 next UPGRADE;
14793             }
14794         }
14795
14796         # And the number of matches should equal the number of expected matches.
14797         $Tests++;
14798         if (@matches == @should_match) {
14799             print "ok $Tests - Nothing was left over; line $line\n";
14800         } else {
14801             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14802         }
14803     }
14804
14805     return;
14806 }
14807
14808 sub Finished() {
14809     print "1..$Tests\n";
14810     exit($Fails ? -1 : 0);
14811 }
14812
14813 Error('\p{Script=InGreek}');    # Bug #69018
14814 Test_X("1100 $nobreak 1161");  # Bug #70940
14815 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
14816 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
14817 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726