lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34 use re "/aa";
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 ##########################################################################
  40 #
  41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  42 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  43 # a pod file and a .t file
  44 #
  45 # The structure of this file is:
  46 #   First these introductory comments; then
  47 #   code needed for everywhere, such as debugging stuff; then
  48 #   code to handle input parameters; then
  49 #   data structures likely to be of external interest (some of which depend on
  50 #       the input parameters, so follows them; then
  51 #   more data structures and subroutine and package (class) definitions; then
  52 #   the small actual loop to process the input files and finish up; then
  53 #   a __DATA__ section, for the .t tests
  54 #
  55 # This program works on all releases of Unicode through at least 6.0.  The
  56 # outputs have been scrutinized most intently for release 5.1.  The others
  57 # have been checked for somewhat more than just sanity.  It can handle all
  58 # existing Unicode character properties in those releases.
  59 #
  60 # This program is mostly about Unicode character (or code point) properties.
  61 # A property describes some attribute or quality of a code point, like if it
  62 # is lowercase or not, its name, what version of Unicode it was first defined
  63 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  64 # possibilities by making all properties into mappings from each code point
  65 # into some corresponding value.  In the case of it being lowercase or not,
  66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  67 # property maps each Unicode code point to a single value, called a "property
  68 # value".  (Hence each Unicode property is a true mathematical function with
  69 # exactly one value per code point.)
  70 #
  71 # When using a property in a regular expression, what is desired isn't the
  72 # mapping of the code point to its property's value, but the reverse (or the
  73 # mathematical "inverse relation"): starting with the property value, "Does a
  74 # code point map to it?"  These are written in a "compound" form:
  75 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  76 # files containing the lists of code points that map to each such regular
  77 # expression property value, one file per list
  78 #
  79 # There is also a single form shortcut that Perl adds for many of the commonly
  80 # used properties.  This happens for all binary properties, plus script,
  81 # general_category, and block properties.
  82 #
  83 # Thus the outputs of this program are files.  There are map files, mostly in
  84 # the 'To' directory; and there are list files for use in regular expression
  85 # matching, all in subdirectories of the 'lib' directory, with each
  86 # subdirectory being named for the property that the lists in it are for.
  87 # Bookkeeping, test, and documentation files are also generated.
  88
  89 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  90 my $map_directory = 'To';        # Where map files go.
  91
  92 # DATA STRUCTURES
  93 #
  94 # The major data structures of this program are Property, of course, but also
  95 # Table.  There are two kinds of tables, very similar to each other.
  96 # "Match_Table" is the data structure giving the list of code points that have
  97 # a particular property value, mentioned above.  There is also a "Map_Table"
  98 # data structure which gives the property's mapping from code point to value.
  99 # There are two structures because the match tables need to be combined in
 100 # various ways, such as constructing unions, intersections, complements, etc.,
 101 # and the map ones don't.  And there would be problems, perhaps subtle, if
 102 # a map table were inadvertently operated on in some of those ways.
 103 # The use of separate classes with operations defined on one but not the other
 104 # prevents accidentally confusing the two.
 105 #
 106 # At the heart of each table's data structure is a "Range_List", which is just
 107 # an ordered list of "Ranges", plus ancillary information, and methods to
 108 # operate on them.  A Range is a compact way to store property information.
 109 # Each range has a starting code point, an ending code point, and a value that
 110 # is meant to apply to all the code points between the two end points,
 111 # inclusive.  For a map table, this value is the property value for those
 112 # code points.  Two such ranges could be written like this:
 113 #   0x41 .. 0x5A, 'Upper',
 114 #   0x61 .. 0x7A, 'Lower'
 115 #
 116 # Each range also has a type used as a convenience to classify the values.
 117 # Most ranges in this program will be Type 0, or normal, but there are some
 118 # ranges that have a non-zero type.  These are used only in map tables, and
 119 # are for mappings that don't fit into the normal scheme of things.  Mappings
 120 # that require a hash entry to communicate with utf8.c are one example;
 121 # another example is mappings for charnames.pm to use which indicate a name
 122 # that is algorithmically determinable from its code point (and vice-versa).
 123 # These are used to significantly compact these tables, instead of listing
 124 # each one of the tens of thousands individually.
 125 #
 126 # In a match table, the value of a range is irrelevant (and hence the type as
 127 # well, which will always be 0), and arbitrarily set to the null string.
 128 # Using the example above, there would be two match tables for those two
 129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 130 # named Lower would contain 0x61..0x7A.
 131 #
 132 # Actually, there are two types of range lists, "Range_Map" is the one
 133 # associated with map tables, and "Range_List" with match tables.
 134 # Again, this is so that methods can be defined on one and not the other so as
 135 # to prevent operating on them in incorrect ways.
 136 #
 137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 138 # in the perl core.  All tables could in theory be written, but some are
 139 # suppressed because there is no current practical use for them.  It is easy
 140 # to change which get written by changing various lists that are near the top
 141 # of the actual code in this file.  The table data structures contain enough
 142 # ancillary information to allow them to be treated as separate entities for
 143 # writing, such as the path to each one's file.  There is a heading in each
 144 # map table that gives the format of its entries, and what the map is for all
 145 # the code points missing from it.  (This allows tables to be more compact.)
 146 #
 147 # The Property data structure contains one or more tables.  All properties
 148 # contain a map table (except the $perl property which is a
 149 # pseudo-property containing only match tables), and any properties that
 150 # are usable in regular expression matches also contain various matching
 151 # tables, one for each value the property can have.  A binary property can
 152 # have two values, True and False (or Y and N, which are preferred by Unicode
 153 # terminology).  Thus each of these properties will have a map table that
 154 # takes every code point and maps it to Y or N (but having ranges cuts the
 155 # number of entries in that table way down), and two match tables, one
 156 # which has a list of all the code points that map to Y, and one for all the
 157 # code points that map to N.  (For each of these, a third table is also
 158 # generated for the pseudo Perl property.  It contains the identical code
 159 # points as the Y table, but can be written, not in the compound form, but in
 160 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 161 # properties have several possible values, some have many, and properties like
 162 # Name have a different value for every named code point.  Those will not,
 163 # unless the controlling lists are changed, have their match tables written
 164 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 165 # constructs will.  Prior to 5.14, generally a property would have either its
 166 # map table or its match tables written but not both.  Again, what gets
 167 # written is controlled by lists which can easily be changed.  Starting in
 168 # 5.14, advantage was taken of this, and all the map tables needed to
 169 # reconstruct the Unicode db are now written out, while suppressing the
 170 # Unicode .txt files that contain the data.  Our tables are much more compact
 171 # than the .txt files, so a significant space savings was achieved.
 172
 173 # Properties have a 'Type', like binary, or string, or enum depending on how
 174 # many match tables there are and the content of the maps.  This 'Type' is
 175 # different than a range 'Type', so don't get confused by the two concepts
 176 # having the same name.
 177 #
 178 # For information about the Unicode properties, see Unicode's UAX44 document:
 179
 180 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 181
 182 # As stated earlier, this program will work on any release of Unicode so far.
 183 # Most obvious problems in earlier data have NOT been corrected except when
 184 # necessary to make Perl or this program work reasonably.  For example, no
 185 # folding information was given in early releases, so this program substitutes
 186 # lower case instead, just so that a regular expression with the /i option
 187 # will do something that actually gives the right results in many cases.
 188 # There are also a couple other corrections for version 1.1.5, commented at
 189 # the point they are made.  As an example of corrections that weren't made
 190 # (but could be) is this statement from DerivedAge.txt: "The supplementary
 191 # private use code points and the non-character code points were assigned in
 192 # version 2.0, but not specifically listed in the UCD until versions 3.0 and
 193 # 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0) More information
 194 # on Unicode version glitches is further down in these introductory comments.
 195 #
 196 # This program works on all non-provisional properties as of 6.0, though the
 197 # files for some are suppressed from apparent lack of demand for them.  You
 198 # can change which are output by changing lists in this program.
 199 #
 200 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 201 # loose matchings rules (from Unicode TR18):
 202 #
 203 #    The recommended names for UCD properties and property values are in
 204 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 205 #    [PropValue]. There are both abbreviated names and longer, more
 206 #    descriptive names. It is strongly recommended that both names be
 207 #    recognized, and that loose matching of property names be used,
 208 #    whereby the case distinctions, whitespace, hyphens, and underbar
 209 #    are ignored.
 210 # The program still allows Fuzzy to override its determination of if loose
 211 # matching should be used, but it isn't currently used, as it is no longer
 212 # needed; the calculations it makes are good enough.
 213 #
 214 # SUMMARY OF HOW IT WORKS:
 215 #
 216 #   Process arguments
 217 #
 218 #   A list is constructed containing each input file that is to be processed
 219 #
 220 #   Each file on the list is processed in a loop, using the associated handler
 221 #   code for each:
 222 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 223 #            first.  These files name the properties and property values.
 224 #            Objects are created of all the property and property value names
 225 #            that the rest of the input should expect, including all synonyms.
 226 #        The other input files give mappings from properties to property
 227 #           values.  That is, they list code points and say what the mapping
 228 #           is under the given property.  Some files give the mappings for
 229 #           just one property; and some for many.  This program goes through
 230 #           each file and populates the properties from them.  Some properties
 231 #           are listed in more than one file, and Unicode has set up a
 232 #           precedence as to which has priority if there is a conflict.  Thus
 233 #           the order of processing matters, and this program handles the
 234 #           conflict possibility by processing the overriding input files
 235 #           last, so that if necessary they replace earlier values.
 236 #        After this is all done, the program creates the property mappings not
 237 #            furnished by Unicode, but derivable from what it does give.
 238 #        The tables of code points that match each property value in each
 239 #            property that is accessible by regular expressions are created.
 240 #        The Perl-defined properties are created and populated.  Many of these
 241 #            require data determined from the earlier steps
 242 #        Any Perl-defined synonyms are created, and name clashes between Perl
 243 #            and Unicode are reconciled and warned about.
 244 #        All the properties are written to files
 245 #        Any other files are written, and final warnings issued.
 246 #
 247 # For clarity, a number of operators have been overloaded to work on tables:
 248 #   ~ means invert (take all characters not in the set).  The more
 249 #       conventional '!' is not used because of the possibility of confusing
 250 #       it with the actual boolean operation.
 251 #   + means union
 252 #   - means subtraction
 253 #   & means intersection
 254 # The precedence of these is the order listed.  Parentheses should be
 255 # copiously used.  These are not a general scheme.  The operations aren't
 256 # defined for a number of things, deliberately, to avoid getting into trouble.
 257 # Operations are done on references and affect the underlying structures, so
 258 # that the copy constructors for them have been overloaded to not return a new
 259 # clone, but the input object itself.
 260 #
 261 # The bool operator is deliberately not overloaded to avoid confusion with
 262 # "should it mean if the object merely exists, or also is non-empty?".
 263 #
 264 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 265 #
 266 # This program needs to be able to run under miniperl.  Therefore, it uses a
 267 # minimum of other modules, and hence implements some things itself that could
 268 # be gotten from CPAN
 269 #
 270 # This program uses inputs published by the Unicode Consortium.  These can
 271 # change incompatibly between releases without the Perl maintainers realizing
 272 # it.  Therefore this program is now designed to try to flag these.  It looks
 273 # at the directories where the inputs are, and flags any unrecognized files.
 274 # It keeps track of all the properties in the files it handles, and flags any
 275 # that it doesn't know how to handle.  It also flags any input lines that
 276 # don't match the expected syntax, among other checks.
 277 #
 278 # It is also designed so if a new input file matches one of the known
 279 # templates, one hopefully just needs to add it to a list to have it
 280 # processed.
 281 #
 282 # As mentioned earlier, some properties are given in more than one file.  In
 283 # particular, the files in the extracted directory are supposedly just
 284 # reformattings of the others.  But they contain information not easily
 285 # derivable from the other files, including results for Unihan, which this
 286 # program doesn't ordinarily look at, and for unassigned code points.  They
 287 # also have historically had errors or been incomplete.  In an attempt to
 288 # create the best possible data, this program thus processes them first to
 289 # glean information missing from the other files; then processes those other
 290 # files to override any errors in the extracted ones.  Much of the design was
 291 # driven by this need to store things and then possibly override them.
 292 #
 293 # It tries to keep fatal errors to a minimum, to generate something usable for
 294 # testing purposes.  It always looks for files that could be inputs, and will
 295 # warn about any that it doesn't know how to handle (the -q option suppresses
 296 # the warning).
 297 #
 298 # Why is there more than one type of range?
 299 #   This simplified things.  There are some very specialized code points that
 300 #   have to be handled specially for output, such as Hangul syllable names.
 301 #   By creating a range type (done late in the development process), it
 302 #   allowed this to be stored with the range, and overridden by other input.
 303 #   Originally these were stored in another data structure, and it became a
 304 #   mess trying to decide if a second file that was for the same property was
 305 #   overriding the earlier one or not.
 306 #
 307 # Why are there two kinds of tables, match and map?
 308 #   (And there is a base class shared by the two as well.)  As stated above,
 309 #   they actually are for different things.  Development proceeded much more
 310 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 311 #   give the property value for every code point (actually every code point
 312 #   that doesn't map to a default value).  Match tables are used for regular
 313 #   expression matches, and are essentially the inverse mapping.  Separating
 314 #   the two allows more specialized methods, and error checks so that one
 315 #   can't just take the intersection of two map tables, for example, as that
 316 #   is nonsensical.
 317 #
 318 # DEBUGGING
 319 #
 320 # This program is written so it will run under miniperl.  Occasionally changes
 321 # will cause an error where the backtrace doesn't work well under miniperl.
 322 # To diagnose the problem, you can instead run it under regular perl, if you
 323 # have one compiled.
 324 #
 325 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 326 # to return true.  Then a line like
 327 #
 328 # local $to_trace = 1 if main::DEBUG;
 329 #
 330 # can be added to enable tracing in its lexical scope or until you insert
 331 # another line:
 332 #
 333 # local $to_trace = 0 if main::DEBUG;
 334 #
 335 # then use a line like "trace $a, @b, %c, ...;
 336 #
 337 # Some of the more complex subroutines already have trace statements in them.
 338 # Permanent trace statements should be like:
 339 #
 340 # trace ... if main::DEBUG && $to_trace;
 341 #
 342 # If there is just one or a few files that you're debugging, you can easily
 343 # cause most everything else to be skipped.  Change the line
 344 #
 345 # my $debug_skip = 0;
 346 #
 347 # to 1, and every file whose object is in @input_file_objects and doesn't have
 348 # a, 'non_skip => 1,' in its constructor will be skipped.
 349 #
 350 # To compare the output tables, it may be useful to specify the -annotate
 351 # flag.  This causes the tables to expand so there is one entry for each
 352 # non-algorithmically named code point giving, currently its name, and its
 353 # graphic representation if printable (and you have a font that knows about
 354 # it).  This makes it easier to see what the particular code points are in
 355 # each output table.  The tables are usable, but because they don't have
 356 # ranges (for the most part), a Perl using them will run slower.  Non-named
 357 # code points are annotated with a description of their status, and contiguous
 358 # ones with the same description will be output as a range rather than
 359 # individually.  Algorithmically named characters are also output as ranges,
 360 # except when there are just a few contiguous ones.
 361 #
 362 # FUTURE ISSUES
 363 #
 364 # The program would break if Unicode were to change its names so that
 365 # interior white space, underscores, or dashes differences were significant
 366 # within property and property value names.
 367 #
 368 # It might be easier to use the xml versions of the UCD if this program ever
 369 # would need heavy revision, and the ability to handle old versions was not
 370 # required.
 371 #
 372 # There is the potential for name collisions, in that Perl has chosen names
 373 # that Unicode could decide it also likes.  There have been such collisions in
 374 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 375 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 376 # collisions, which were withdrawn before the final release, because of Perl's
 377 # and other's protests.  These all involved new properties which began with
 378 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 379 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 380 # Unicode document, so they are unlikely to be used by Unicode for another
 381 # purpose.  However, they might try something beginning with 'In', or use any
 382 # of the other Perl-defined properties.  This program will warn you of name
 383 # collisions, and refuse to generate tables with them, but manual intervention
 384 # will be required in this event.  One scheme that could be implemented, if
 385 # necessary, would be to have this program generate another file, or add a
 386 # field to mktables.lst that gives the date of first definition of a property.
 387 # Each new release of Unicode would use that file as a basis for the next
 388 # iteration.  And the Perl synonym addition code could sort based on the age
 389 # of the property, so older properties get priority, and newer ones that clash
 390 # would be refused; hence existing code would not be impacted, and some other
 391 # synonym would have to be used for the new property.  This is ugly, and
 392 # manual intervention would certainly be easier to do in the short run; lets
 393 # hope it never comes to this.
 394 #
 395 # A NOTE ON UNIHAN
 396 #
 397 # This program can generate tables from the Unihan database.  But it doesn't
 398 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 399 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 400 # database was split into 8 different files, all beginning with the letters
 401 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 402 # know which of the many properties in the file(s) should have tables created
 403 # for them.  It will create tables for any properties listed in
 404 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 405 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 406 # property you want is not in those files of the release you are building
 407 # against, you must add it to those two arrays.  Starting in 4.0, the
 408 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 409 # is present in the directory, a table will be generated for that property.
 410 # In 5.2, several more properties were added.  For your convenience, the two
 411 # arrays are initialized with all the 6.0 listed properties that are also in
 412 # earlier releases.  But these are commented out.  You can just uncomment the
 413 # ones you want, or use them as a template for adding entries for other
 414 # properties.
 415 #
 416 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 417 # and filter_unihan_line() are the functions where this is done.  This program
 418 # already does some adjusting to make the lines look more like the rest of the
 419 # Unicode DB;  You can see what that is in filter_unihan_line()
 420 #
 421 # There is a bug in the 3.2 data file in which some values for the
 422 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 423 # could be added for these; or for a particular installation, the Unihan.txt
 424 # file could be edited to fix them.
 425 #
 426 # HOW TO ADD A FILE TO BE PROCESSED
 427 #
 428 # A new file from Unicode needs to have an object constructed for it in
 429 # @input_file_objects, probably at the end or at the end of the extracted
 430 # ones.  The program should warn you if its name will clash with others on
 431 # restrictive file systems, like DOS.  If so, figure out a better name, and
 432 # add lines to the README.perl file giving that.  If the file is a character
 433 # property, it should be in the format that Unicode has by default
 434 # standardized for such files for the more recently introduced ones.
 435 # If so, the Input_file constructor for @input_file_objects can just be the
 436 # file name and release it first appeared in.  If not, then it should be
 437 # possible to construct an each_line_handler() to massage the line into the
 438 # standardized form.
 439 #
 440 # For non-character properties, more code will be needed.  You can look at
 441 # the existing entries for clues.
 442 #
 443 # UNICODE VERSIONS NOTES
 444 #
 445 # The Unicode UCD has had a number of errors in it over the versions.  And
 446 # these remain, by policy, in the standard for that version.  Therefore it is
 447 # risky to correct them, because code may be expecting the error.  So this
 448 # program doesn't generally make changes, unless the error breaks the Perl
 449 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 450 # for U+1105, which causes real problems for the algorithms for Jamo
 451 # calculations, so it is changed here.
 452 #
 453 # But it isn't so clear cut as to what to do about concepts that are
 454 # introduced in a later release; should they extend back to earlier releases
 455 # where the concept just didn't exist?  It was easier to do this than to not,
 456 # so that's what was done.  For example, the default value for code points not
 457 # in the files for various properties was probably undefined until changed by
 458 # some version.  No_Block for blocks is such an example.  This program will
 459 # assign No_Block even in Unicode versions that didn't have it.  This has the
 460 # benefit that code being written doesn't have to special case earlier
 461 # versions; and the detriment that it doesn't match the Standard precisely for
 462 # the affected versions.
 463 #
 464 # Here are some observations about some of the issues in early versions:
 465 #
 466 # The number of code points in \p{alpha} halved in 2.1.9.  It turns out that
 467 # the reason is that the CJK block starting at 4E00 was removed from PropList,
 468 # and was not put back in until 3.1.0
 469 #
 470 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 471 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 472 # reason is that 3.2 introduced U+205F=medium math space, which was not
 473 # classed as white space, but Perl figured out that it should have been. 4.0
 474 # reclassified it correctly.
 475 #
 476 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 477 # this was erroneously a synonym for 202.  In 4.0, ATB became 202, and ATBL
 478 # was left with no code points, as all the ones that mapped to 202 stayed
 479 # mapped to 202.  Thus if your program used the numeric name for the class,
 480 # it would not have been affected, but if it used the mnemonic, it would have
 481 # been.
 482 #
 483 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 484 # points which eventually came to have this script property value, instead
 485 # mapped to "Unknown".  But in the next release all these code points were
 486 # moved to \p{sc=common} instead.
 487 #
 488 # The default for missing code points for BidiClass is complicated.  Starting
 489 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 490 # tries to do the best it can for earlier releases.  It is done in
 491 # process_PropertyAliases()
 492 #
 493 ##############################################################################
 494
 495 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 496                         # and errors
 497 my $MAX_LINE_WIDTH = 78;
 498
 499 # Debugging aid to skip most files so as to not be distracted by them when
 500 # concentrating on the ones being debugged.  Add
 501 # non_skip => 1,
 502 # to the constructor for those files you want processed when you set this.
 503 # Files with a first version number of 0 are special: they are always
 504 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 505 # UnicodeData.txt must not be skipped if you want this program to not die
 506 # before normal completion.
 507 my $debug_skip = 0;
 508
 509 # Set to 1 to enable tracing.
 510 our $to_trace = 0;
 511
 512 { # Closure for trace: debugging aid
 513     my $print_caller = 1;        # ? Include calling subroutine name
 514     my $main_with_colon = 'main::';
 515     my $main_colon_length = length($main_with_colon);
 516
 517     sub trace {
 518         return unless $to_trace;        # Do nothing if global flag not set
 519
 520         my @input = @_;
 521
 522         local $DB::trace = 0;
 523         $DB::trace = 0;          # Quiet 'used only once' message
 524
 525         my $line_number;
 526
 527         # Loop looking up the stack to get the first non-trace caller
 528         my $caller_line;
 529         my $caller_name;
 530         my $i = 0;
 531         do {
 532             $line_number = $caller_line;
 533             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 534             $caller = $main_with_colon unless defined $caller;
 535
 536             $caller_name = $caller;
 537
 538             # get rid of pkg
 539             $caller_name =~ s/.*:://;
 540             if (substr($caller_name, 0, $main_colon_length)
 541                 eq $main_with_colon)
 542             {
 543                 $caller_name = substr($caller_name, $main_colon_length);
 544             }
 545
 546         } until ($caller_name ne 'trace');
 547
 548         # If the stack was empty, we were called from the top level
 549         $caller_name = 'main' if ($caller_name eq ""
 550                                     || $caller_name eq 'trace');
 551
 552         my $output = "";
 553         foreach my $string (@input) {
 554             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 555             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 556                 $output .= simple_dumper($string);
 557             }
 558             else {
 559                 $string = "$string" if ref $string;
 560                 $string = $UNDEF unless defined $string;
 561                 chomp $string;
 562                 $string = '""' if $string eq "";
 563                 $output .= " " if $output ne ""
 564                                 && $string ne ""
 565                                 && substr($output, -1, 1) ne " "
 566                                 && substr($string, 0, 1) ne " ";
 567                 $output .= $string;
 568             }
 569         }
 570
 571         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 572         print STDERR "$caller_name: " if $print_caller;
 573         print STDERR $output, "\n";
 574         return;
 575     }
 576 }
 577
 578 # This is for a rarely used development feature that allows you to compare two
 579 # versions of the Unicode standard without having to deal with changes caused
 580 # by the code points introduced in the later version.  Change the 0 to a
 581 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 582 # code points introduced in that release and earlier will be used; later ones
 583 # are thrown away.  You use the version number of the earliest one you want to
 584 # compare; then run this program on directory structures containing each
 585 # release, and compare the outputs.  These outputs will therefore include only
 586 # the code points common to both releases, and you can see the changes caused
 587 # just by the underlying release semantic changes.  For versions earlier than
 588 # 3.2, you must copy a version of DAge.txt into the directory.
 589 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 590 my $compare_versions = DEBUG
 591                        && $string_compare_versions
 592                        && pack "C*", split /\./, $string_compare_versions;
 593
 594 sub uniques {
 595     # Returns non-duplicated input values.  From "Perl Best Practices:
 596     # Encapsulated Cleverness".  p. 455 in first edition.
 597
 598     my %seen;
 599     # Arguably this breaks encapsulation, if the goal is to permit multiple
 600     # distinct objects to stringify to the same value, and be interchangeable.
 601     # However, for this program, no two objects stringify identically, and all
 602     # lists passed to this function are either objects or strings. So this
 603     # doesn't affect correctness, but it does give a couple of percent speedup.
 604     no overloading;
 605     return grep { ! $seen{$_}++ } @_;
 606 }
 607
 608 $0 = File::Spec->canonpath($0);
 609
 610 my $make_test_script = 0;      # ? Should we output a test script
 611 my $write_unchanged_files = 0; # ? Should we update the output files even if
 612                                #    we don't think they have changed
 613 my $use_directory = "";        # ? Should we chdir somewhere.
 614 my $pod_directory;             # input directory to store the pod file.
 615 my $pod_file = 'perluniprops';
 616 my $t_path;                     # Path to the .t test file
 617 my $file_list = 'mktables.lst'; # File to store input and output file names.
 618                                # This is used to speed up the build, by not
 619                                # executing the main body of the program if
 620                                # nothing on the list has changed since the
 621                                # previous build
 622 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 623                                # make a list so that when the pumpking is
 624                                # preparing a release, s/he won't have to do
 625                                # special things
 626 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 627                                # in the input.
 628 my $output_range_counts = $debugging_build;   # ? Should we include the number
 629                                               # of code points in ranges in
 630                                               # the output
 631 my $annotate = 0;              # ? Should character names be in the output
 632
 633 # Verbosity levels; 0 is quiet
 634 my $NORMAL_VERBOSITY = 1;
 635 my $PROGRESS = 2;
 636 my $VERBOSE = 3;
 637
 638 my $verbosity = $NORMAL_VERBOSITY;
 639
 640 # Process arguments
 641 while (@ARGV) {
 642     my $arg = shift @ARGV;
 643     if ($arg eq '-v') {
 644         $verbosity = $VERBOSE;
 645     }
 646     elsif ($arg eq '-p') {
 647         $verbosity = $PROGRESS;
 648         $| = 1;     # Flush buffers as we go.
 649     }
 650     elsif ($arg eq '-q') {
 651         $verbosity = 0;
 652     }
 653     elsif ($arg eq '-w') {
 654         $write_unchanged_files = 1; # update the files even if havent changed
 655     }
 656     elsif ($arg eq '-check') {
 657         my $this = shift @ARGV;
 658         my $ok = shift @ARGV;
 659         if ($this ne $ok) {
 660             print "Skipping as check params are not the same.\n";
 661             exit(0);
 662         }
 663     }
 664     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 665         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 666     }
 667     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 668     {
 669         $make_test_script = 1;
 670     }
 671     elsif ($arg eq '-makelist') {
 672         $make_list = 1;
 673     }
 674     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 675         -d $use_directory or croak "Unknown directory '$use_directory'";
 676     }
 677     elsif ($arg eq '-L') {
 678
 679         # Existence not tested until have chdir'd
 680         $file_list = shift;
 681     }
 682     elsif ($arg eq '-globlist') {
 683         $glob_list = 1;
 684     }
 685     elsif ($arg eq '-c') {
 686         $output_range_counts = ! $output_range_counts
 687     }
 688     elsif ($arg eq '-annotate') {
 689         $annotate = 1;
 690         $debugging_build = 1;
 691         $output_range_counts = 1;
 692     }
 693     else {
 694         my $with_c = 'with';
 695         $with_c .= 'out' if $output_range_counts;   # Complements the state
 696         croak <<END;
 697 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 698           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 699           [-check A B ]
 700   -c          : Output comments $with_c number of code points in ranges
 701   -q          : Quiet Mode: Only output serious warnings.
 702   -p          : Set verbosity level to normal plus show progress.
 703   -v          : Set Verbosity level high:  Show progress and non-serious
 704                 warnings
 705   -w          : Write files regardless
 706   -C dir      : Change to this directory before proceeding. All relative paths
 707                 except those specified by the -P and -T options will be done
 708                 with respect to this directory.
 709   -P dir      : Output $pod_file file to directory 'dir'.
 710   -T path     : Create a test script as 'path'; overrides -maketest
 711   -L filelist : Use alternate 'filelist' instead of standard one
 712   -globlist   : Take as input all non-Test *.txt files in current and sub
 713                 directories
 714   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 715                 overrides -T
 716   -makelist   : Rewrite the file list $file_list based on current setup
 717   -annotate   : Output an annotation for each character in the table files;
 718                 useful for debugging mktables, looking at diffs; but is slow,
 719                 memory intensive; resulting tables are usable but are slow and
 720                 very large (and currently fail the Unicode::UCD.t tests).
 721   -check A B  : Executes $0 only if A and B are the same
 722 END
 723     }
 724 }
 725
 726 # Stores the most-recently changed file.  If none have changed, can skip the
 727 # build
 728 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 729
 730 # Change directories now, because need to read 'version' early.
 731 if ($use_directory) {
 732     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 733         $pod_directory = File::Spec->rel2abs($pod_directory);
 734     }
 735     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 736         $t_path = File::Spec->rel2abs($t_path);
 737     }
 738     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 739     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 740         $pod_directory = File::Spec->abs2rel($pod_directory);
 741     }
 742     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 743         $t_path = File::Spec->abs2rel($t_path);
 744     }
 745 }
 746
 747 # Get Unicode version into regular and v-string.  This is done now because
 748 # various tables below get populated based on it.  These tables are populated
 749 # here to be near the top of the file, and so easily seeable by those needing
 750 # to modify things.
 751 open my $VERSION, "<", "version"
 752                     or croak "$0: can't open required file 'version': $!\n";
 753 my $string_version = <$VERSION>;
 754 close $VERSION;
 755 chomp $string_version;
 756 my $v_version = pack "C*", split /\./, $string_version;        # v string
 757
 758 # The following are the complete names of properties with property values that
 759 # are known to not match any code points in some versions of Unicode, but that
 760 # may change in the future so they should be matchable, hence an empty file is
 761 # generated for them.
 762 my @tables_that_may_be_empty = (
 763                                 'Joining_Type=Left_Joining',
 764                                 );
 765 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 766 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 767 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 768                                                     if $v_version ge v4.1.0;
 769 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 770                                                     if $v_version ge v6.0.0;
 771 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 772                                                     if $v_version ge v6.1.0;
 773
 774 # The lists below are hashes, so the key is the item in the list, and the
 775 # value is the reason why it is in the list.  This makes generation of
 776 # documentation easier.
 777
 778 my %why_suppressed;  # No file generated for these.
 779
 780 # Files aren't generated for empty extraneous properties.  This is arguable.
 781 # Extraneous properties generally come about because a property is no longer
 782 # used in a newer version of Unicode.  If we generated a file without code
 783 # points, programs that used to work on that property will still execute
 784 # without errors.  It just won't ever match (or will always match, with \P{}).
 785 # This means that the logic is now likely wrong.  I (khw) think its better to
 786 # find this out by getting an error message.  Just move them to the table
 787 # above to change this behavior
 788 my %why_suppress_if_empty_warn_if_not = (
 789
 790    # It is the only property that has ever officially been removed from the
 791    # Standard.  The database never contained any code points for it.
 792    'Special_Case_Condition' => 'Obsolete',
 793
 794    # Apparently never official, but there were code points in some versions of
 795    # old-style PropList.txt
 796    'Non_Break' => 'Obsolete',
 797 );
 798
 799 # These would normally go in the warn table just above, but they were changed
 800 # a long time before this program was written, so warnings about them are
 801 # moot.
 802 if ($v_version gt v3.2.0) {
 803     push @tables_that_may_be_empty,
 804                                 'Canonical_Combining_Class=Attached_Below_Left'
 805 }
 806
 807 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 808 # unless explicitly added.
 809 if ($v_version ge v5.2.0) {
 810     my $unihan = 'Unihan; remove from list if using Unihan';
 811     foreach my $table (qw (
 812                            kAccountingNumeric
 813                            kOtherNumeric
 814                            kPrimaryNumeric
 815                            kCompatibilityVariant
 816                            kIICore
 817                            kIRG_GSource
 818                            kIRG_HSource
 819                            kIRG_JSource
 820                            kIRG_KPSource
 821                            kIRG_MSource
 822                            kIRG_KSource
 823                            kIRG_TSource
 824                            kIRG_USource
 825                            kIRG_VSource
 826                            kRSUnicode
 827                         ))
 828     {
 829         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 830     }
 831 }
 832
 833 # Enum values for to_output_map() method in the Map_Table package.
 834 my $EXTERNAL_MAP = 1;
 835 my $INTERNAL_MAP = 2;
 836 my $OUTPUT_ADJUSTED = 3;
 837
 838 # To override computed values for writing the map tables for these properties.
 839 # The default for enum map tables is to write them out, so that the Unicode
 840 # .txt files can be removed, but all the data to compute any property value
 841 # for any code point is available in a more compact form.
 842 my %global_to_output_map = (
 843     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 844     # get stuck supporting it if things change.  Since it is a STRING
 845     # property, it normally would be listed in the pod, but INTERNAL_MAP
 846     # suppresses that.
 847     Unicode_1_Name => $INTERNAL_MAP,
 848
 849     Present_In => 0,                # Suppress, as easily computed from Age
 850     Block => 0,                     # Suppress, as Blocks.txt is retained.
 851
 852     # Suppress, as mapping can be found instead from the
 853     # Perl_Decomposition_Mapping file
 854     Decomposition_Type => 0,
 855 );
 856
 857 # Properties that this program ignores.
 858 my @unimplemented_properties;
 859
 860 # With this release, it is automatically handled if the Unihan db is
 861 # downloaded
 862 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
 863
 864 # There are several types of obsolete properties defined by Unicode.  These
 865 # must be hand-edited for every new Unicode release.
 866 my %why_deprecated;  # Generates a deprecated warning message if used.
 867 my %why_stabilized;  # Documentation only
 868 my %why_obsolete;    # Documentation only
 869
 870 {   # Closure
 871     my $simple = 'Perl uses the more complete version of this property';
 872     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 873
 874     my $other_properties = 'other properties';
 875     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 876     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 877
 878     %why_deprecated = (
 879         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 880         'Jamo_Short_Name' => $contributory,
 881         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 882         'Other_Alphabetic' => $contributory,
 883         'Other_Default_Ignorable_Code_Point' => $contributory,
 884         'Other_Grapheme_Extend' => $contributory,
 885         'Other_ID_Continue' => $contributory,
 886         'Other_ID_Start' => $contributory,
 887         'Other_Lowercase' => $contributory,
 888         'Other_Math' => $contributory,
 889         'Other_Uppercase' => $contributory,
 890         'Expands_On_NFC' => $why_no_expand,
 891         'Expands_On_NFD' => $why_no_expand,
 892         'Expands_On_NFKC' => $why_no_expand,
 893         'Expands_On_NFKD' => $why_no_expand,
 894     );
 895
 896     %why_suppressed = (
 897         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 898         # contains the same information, but without the algorithmically
 899         # determinable Hangul syllables'.  This file is not published, so it's
 900         # existence is not noted in the comment.
 901         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
 902
 903         'Indic_Matra_Category' => "Provisional",
 904         'Indic_Syllabic_Category' => "Provisional",
 905
 906         # Don't suppress ISO_Comment, as otherwise special handling is needed
 907         # to differentiate between it and gc=c, which can be written as 'isc',
 908         # which is the same characters as ISO_Comment's short name.
 909
 910         'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
 911
 912         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
 913         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 914         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 915         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 916
 917         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 918     );
 919
 920     foreach my $property (
 921
 922             # The following are suppressed because they were made contributory
 923             # or deprecated by Unicode before Perl ever thought about
 924             # supporting them.
 925             'Jamo_Short_Name',
 926             'Grapheme_Link',
 927             'Expands_On_NFC',
 928             'Expands_On_NFD',
 929             'Expands_On_NFKC',
 930             'Expands_On_NFKD',
 931
 932             # The following are suppressed because they have been marked
 933             # as deprecated for a sufficient amount of time
 934             'Other_Alphabetic',
 935             'Other_Default_Ignorable_Code_Point',
 936             'Other_Grapheme_Extend',
 937             'Other_ID_Continue',
 938             'Other_ID_Start',
 939             'Other_Lowercase',
 940             'Other_Math',
 941             'Other_Uppercase',
 942     ) {
 943         $why_suppressed{$property} = $why_deprecated{$property};
 944     }
 945
 946     # Customize the message for all the 'Other_' properties
 947     foreach my $property (keys %why_deprecated) {
 948         next if (my $main_property = $property) !~ s/^Other_//;
 949         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 950     }
 951 }
 952
 953 if ($v_version ge 4.0.0) {
 954     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 955     if ($v_version ge 6.0.0) {
 956         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
 957     }
 958 }
 959 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
 960     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 961     if ($v_version ge 6.0.0) {
 962         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
 963     }
 964 }
 965
 966 # Probably obsolete forever
 967 if ($v_version ge v4.1.0) {
 968     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
 969 }
 970 if ($v_version ge v6.0.0) {
 971     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
 972     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
 973 }
 974
 975 # This program can create files for enumerated-like properties, such as
 976 # 'Numeric_Type'.  This file would be the same format as for a string
 977 # property, with a mapping from code point to its value, so you could look up,
 978 # for example, the script a code point is in.  But no one so far wants this
 979 # mapping, or they have found another way to get it since this is a new
 980 # feature.  So no file is generated except if it is in this list.
 981 my @output_mapped_properties = split "\n", <<END;
 982 END
 983
 984 # If you are using the Unihan database in a Unicode version before 5.2, you
 985 # need to add the properties that you want to extract from it to this table.
 986 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
 987 # listed, commented out
 988 my @cjk_properties = split "\n", <<'END';
 989 #cjkAccountingNumeric; kAccountingNumeric
 990 #cjkOtherNumeric; kOtherNumeric
 991 #cjkPrimaryNumeric; kPrimaryNumeric
 992 #cjkCompatibilityVariant; kCompatibilityVariant
 993 #cjkIICore ; kIICore
 994 #cjkIRG_GSource; kIRG_GSource
 995 #cjkIRG_HSource; kIRG_HSource
 996 #cjkIRG_JSource; kIRG_JSource
 997 #cjkIRG_KPSource; kIRG_KPSource
 998 #cjkIRG_KSource; kIRG_KSource
 999 #cjkIRG_TSource; kIRG_TSource
1000 #cjkIRG_USource; kIRG_USource
1001 #cjkIRG_VSource; kIRG_VSource
1002 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
1003 END
1004
1005 # Similarly for the property values.  For your convenience, the lines in the
1006 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
1007 # '#' marks (for Unicode versions before 5.2)
1008 my @cjk_property_values = split "\n", <<'END';
1009 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1010 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1011 ## @missing: 0000..10FFFF; cjkIICore; <none>
1012 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1013 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1014 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1015 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1016 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1017 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1018 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1019 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1020 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1021 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1022 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1023 END
1024
1025 # The input files don't list every code point.  Those not listed are to be
1026 # defaulted to some value.  Below are hard-coded what those values are for
1027 # non-binary properties as of 5.1.  Starting in 5.0, there are
1028 # machine-parsable comment lines in the files the give the defaults; so this
1029 # list shouldn't have to be extended.  The claim is that all missing entries
1030 # for binary properties will default to 'N'.  Unicode tried to change that in
1031 # 5.2, but the beta period produced enough protest that they backed off.
1032 #
1033 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1034 # be in the form that it expects.  The others may be synonyms.
1035 my $CODE_POINT = '<code point>';
1036 my %default_mapping = (
1037     Age => "Unassigned",
1038     # Bidi_Class => Complicated; set in code
1039     Bidi_Mirroring_Glyph => "",
1040     Block => 'No_Block',
1041     Canonical_Combining_Class => 0,
1042     Case_Folding => $CODE_POINT,
1043     Decomposition_Mapping => $CODE_POINT,
1044     Decomposition_Type => 'None',
1045     East_Asian_Width => "Neutral",
1046     FC_NFKC_Closure => $CODE_POINT,
1047     General_Category => 'Cn',
1048     Grapheme_Cluster_Break => 'Other',
1049     Hangul_Syllable_Type => 'NA',
1050     ISO_Comment => "",
1051     Jamo_Short_Name => "",
1052     Joining_Group => "No_Joining_Group",
1053     # Joining_Type => Complicated; set in code
1054     kIICore => 'N',   #                       Is converted to binary
1055     #Line_Break => Complicated; set in code
1056     Lowercase_Mapping => $CODE_POINT,
1057     Name => "",
1058     Name_Alias => "",
1059     NFC_QC => 'Yes',
1060     NFD_QC => 'Yes',
1061     NFKC_QC => 'Yes',
1062     NFKD_QC => 'Yes',
1063     Numeric_Type => 'None',
1064     Numeric_Value => 'NaN',
1065     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1066     Sentence_Break => 'Other',
1067     Simple_Case_Folding => $CODE_POINT,
1068     Simple_Lowercase_Mapping => $CODE_POINT,
1069     Simple_Titlecase_Mapping => $CODE_POINT,
1070     Simple_Uppercase_Mapping => $CODE_POINT,
1071     Titlecase_Mapping => $CODE_POINT,
1072     Unicode_1_Name => "",
1073     Unicode_Radical_Stroke => "",
1074     Uppercase_Mapping => $CODE_POINT,
1075     Word_Break => 'Other',
1076 );
1077
1078 # Below are files that Unicode furnishes, but this program ignores, and why
1079 my %ignored_files = (
1080     'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1081     'Index.txt' => 'Alphabetical index of Unicode characters',
1082     'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1083     'NamesList.txt' => 'Annotated list of characters',
1084     'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1085     'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1086     'ReadMe.txt' => 'Documentation',
1087     'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized.  This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1088     'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1089     'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1090     'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1091     'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1092     'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1093 );
1094
1095 my %skipped_files;  # List of files that we skip
1096
1097 ### End of externally interesting definitions, except for @input_file_objects
1098
1099 my $HEADER=<<"EOF";
1100 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1101 # This file is machine-generated by $0 from the Unicode
1102 # database, Version $string_version.  Any changes made here will be lost!
1103 EOF
1104
1105 my $INTERNAL_ONLY_HEADER = <<"EOF";
1106
1107 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1108 # This file is for internal use by core Perl only.  The format and even the
1109 # name or existence of this file are subject to change without notice.  Don't
1110 # use it directly.
1111 EOF
1112
1113 my $DEVELOPMENT_ONLY=<<"EOF";
1114 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1115 # This file contains information artificially constrained to code points
1116 # present in Unicode release $string_compare_versions.
1117 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1118 # not be used for production.
1119
1120 EOF
1121
1122 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1123 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1124 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1125
1126 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1127 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1128 # to decrease backtracking.  The first regex allows the code point to be at
1129 # the end of a word, but to work properly, the word shouldn't end with a valid
1130 # hex character.  The second one won't match a code point at the end of a
1131 # word, and doesn't have the run-on issue
1132 my $run_on_code_point_re =
1133             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1134 my $code_point_re = qr/\b$run_on_code_point_re/;
1135
1136 # This matches the beginning of the line in the Unicode db files that give the
1137 # defaults for code points not listed (i.e., missing) in the file.  The code
1138 # depends on this ending with a semi-colon, so it can assume it is a valid
1139 # field when the line is split() by semi-colons
1140 my $missing_defaults_prefix =
1141             qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1142
1143 # Property types.  Unicode has more types, but these are sufficient for our
1144 # purposes.
1145 my $UNKNOWN = -1;   # initialized to illegal value
1146 my $NON_STRING = 1; # Either binary or enum
1147 my $BINARY = 2;
1148 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1149                        # tables, additional true and false tables are
1150                        # generated so that false is anything matching the
1151                        # default value, and true is everything else.
1152 my $ENUM = 4;       # Include catalog
1153 my $STRING = 5;     # Anything else: string or misc
1154
1155 # Some input files have lines that give default values for code points not
1156 # contained in the file.  Sometimes these should be ignored.
1157 my $NO_DEFAULTS = 0;        # Must evaluate to false
1158 my $NOT_IGNORED = 1;
1159 my $IGNORED = 2;
1160
1161 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1162 # and will appear in the main body of the tables in the output files, but
1163 # there are other types of ranges as well, listed below, that are specially
1164 # handled.   There are pseudo-types as well that will never be stored as a
1165 # type, but will affect the calculation of the type.
1166
1167 # 0 is for normal, non-specials
1168 my $MULTI_CP = 1;           # Sequence of more than code point
1169 my $HANGUL_SYLLABLE = 2;
1170 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1171 my $NULL = 4;               # The map is to the null string; utf8.c can't
1172                             # handle these, nor is there an accepted syntax
1173                             # for them in \p{} constructs
1174 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1175                              # otherwise be $MULTI_CP type are instead type 0
1176
1177 # process_generic_property_file() can accept certain overrides in its input.
1178 # Each of these must begin AND end with $CMD_DELIM.
1179 my $CMD_DELIM = "\a";
1180 my $REPLACE_CMD = 'replace';    # Override the Replace
1181 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1182
1183 my $NO = 0;
1184 my $YES = 1;
1185
1186 # Values for the Replace argument to add_range.
1187 # $NO                      # Don't replace; add only the code points not
1188                            # already present.
1189 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1190                            # the comments at the subroutine definition.
1191 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1192 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1193                            # already there
1194 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1195                            # already there
1196 my $CROAK = 6;             # Die with an error if is already there
1197
1198 # Flags to give property statuses.  The phrases are to remind maintainers that
1199 # if the flag is changed, the indefinite article referring to it in the
1200 # documentation may need to be as well.
1201 my $NORMAL = "";
1202 my $DEPRECATED = 'D';
1203 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1204 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1205 my $DISCOURAGED = 'X';
1206 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1207 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1208 my $STRICTER = 'T';
1209 my $a_bold_stricter = "a 'B<$STRICTER>'";
1210 my $A_bold_stricter = "A 'B<$STRICTER>'";
1211 my $STABILIZED = 'S';
1212 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1213 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1214 my $OBSOLETE = 'O';
1215 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1216 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1217
1218 my %status_past_participles = (
1219     $DISCOURAGED => 'discouraged',
1220     $STABILIZED => 'stabilized',
1221     $OBSOLETE => 'obsolete',
1222     $DEPRECATED => 'deprecated',
1223 );
1224
1225 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1226 # externally documented.
1227 my $ORDINARY = 0;       # The normal fate.
1228 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1229                         # but there is a file written that can be used to
1230                         # reconstruct this table
1231 my $SUPPRESSED = 3;     # The file for this table is not written out.
1232 my $INTERNAL_ONLY = 4;  # The file for this table is written out, but it is
1233                         # for Perl's internal use only
1234 my $PLACEHOLDER = 5;    # A property that is defined as a placeholder in a
1235                         # Unicode version that doesn't have it, but we need it
1236                         # to be defined, if empty, to have things work.
1237                         # Implies no pod entry generated
1238
1239 # The format of the values of the tables:
1240 my $EMPTY_FORMAT = "";
1241 my $BINARY_FORMAT = 'b';
1242 my $DECIMAL_FORMAT = 'd';
1243 my $FLOAT_FORMAT = 'f';
1244 my $INTEGER_FORMAT = 'i';
1245 my $HEX_FORMAT = 'x';
1246 my $RATIONAL_FORMAT = 'r';
1247 my $STRING_FORMAT = 's';
1248 my $ADJUST_FORMAT = 'a';
1249 my $DECOMP_STRING_FORMAT = 'c';
1250 my $STRING_WHITE_SPACE_LIST = 'sw';
1251
1252 my %map_table_formats = (
1253     $BINARY_FORMAT => 'binary',
1254     $DECIMAL_FORMAT => 'single decimal digit',
1255     $FLOAT_FORMAT => 'floating point number',
1256     $INTEGER_FORMAT => 'integer',
1257     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1258     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1259     $STRING_FORMAT => 'string',
1260     $ADJUST_FORMAT => 'some entries need adjustment',
1261     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1262     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1263 );
1264
1265 # Unicode didn't put such derived files in a separate directory at first.
1266 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1267 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1268 my $AUXILIARY = 'auxiliary';
1269
1270 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1271 # and into UCD.pl for the use of UCD.pm
1272 my %loose_to_file_of;       # loosely maps table names to their respective
1273                             # files
1274 my %stricter_to_file_of;    # same; but for stricter mapping.
1275 my %loose_property_to_file_of; # Maps a loose property name to its map file
1276 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1277                             # in the hash %utf8::SwashInfo
1278 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1279                              # their rational equivalent
1280 my %loose_property_name_of; # Loosely maps (non_string) property names to
1281                             # standard form
1282 my %string_property_loose_to_name; # Same, for string properties.
1283 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1284                             # the property name in standard loose form, and
1285                             # 'value' is the default value for that property,
1286                             # also in standard loose form.
1287 my %loose_to_standard_value; # loosely maps table names to the canonical
1288                             # alias for them
1289 my %ambiguous_names;        # keys are alias names (in standard form) that
1290                             # have more than one possible meaning.
1291 my %prop_aliases;           # Keys are standard property name; values are each
1292                             # one's aliases
1293 my %prop_value_aliases;     # Keys of top level are standard property name;
1294                             # values are keys to another hash,  Each one is
1295                             # one of the property's values, in standard form.
1296                             # The values are that prop-val's aliases.
1297 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1298
1299 # Most properties are immune to caseless matching, otherwise you would get
1300 # nonsensical results, as properties are a function of a code point, not
1301 # everything that is caselessly equivalent to that code point.  For example,
1302 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1303 # be true because 's' and 'S' are equivalent caselessly.  However,
1304 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1305 # extend that concept to those very few properties that are like this.  Each
1306 # such property will match the full range caselessly.  They are hard-coded in
1307 # the program; it's not worth trying to make it general as it's extremely
1308 # unlikely that they will ever change.
1309 my %caseless_equivalent_to;
1310
1311 # These constants names and values were taken from the Unicode standard,
1312 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1313 # syllables.  The '_string' versions are so generated tables can retain the
1314 # hex format, which is the more familiar value
1315 my $SBase_string = "0xAC00";
1316 my $SBase = CORE::hex $SBase_string;
1317 my $LBase_string = "0x1100";
1318 my $LBase = CORE::hex $LBase_string;
1319 my $VBase_string = "0x1161";
1320 my $VBase = CORE::hex $VBase_string;
1321 my $TBase_string = "0x11A7";
1322 my $TBase = CORE::hex $TBase_string;
1323 my $SCount = 11172;
1324 my $LCount = 19;
1325 my $VCount = 21;
1326 my $TCount = 28;
1327 my $NCount = $VCount * $TCount;
1328
1329 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1330 # with the above published constants.
1331 my %Jamo;
1332 my %Jamo_L;     # Leading consonants
1333 my %Jamo_V;     # Vowels
1334 my %Jamo_T;     # Trailing consonants
1335
1336 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1337 # The key is the base name of the code point, and the value is an
1338 # array giving all the ranges that use this base name.  Each range
1339 # is actually a hash giving the 'low' and 'high' values of it.
1340 my %names_ending_in_code_point;
1341 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1342                                         # removed from the names
1343 # Inverse mapping.  The list of ranges that have these kinds of
1344 # names.  Each element contains the low, high, and base names in an
1345 # anonymous hash.
1346 my @code_points_ending_in_code_point;
1347
1348 # Boolean: does this Unicode version have the hangul syllables, and are we
1349 # writing out a table for them?
1350 my $has_hangul_syllables = 0;
1351
1352 # Does this Unicode version have code points whose names end in their
1353 # respective code points, and are we writing out a table for them?  0 for no;
1354 # otherwise points to first property that a table is needed for them, so that
1355 # if multiple tables are needed, we don't create duplicates
1356 my $needing_code_points_ending_in_code_point = 0;
1357
1358 my @backslash_X_tests;     # List of tests read in for testing \X
1359 my @unhandled_properties;  # Will contain a list of properties found in
1360                            # the input that we didn't process.
1361 my @match_properties;      # Properties that have match tables, to be
1362                            # listed in the pod
1363 my @map_properties;        # Properties that get map files written
1364 my @named_sequences;       # NamedSequences.txt contents.
1365 my %potential_files;       # Generated list of all .txt files in the directory
1366                            # structure so we can warn if something is being
1367                            # ignored.
1368 my @files_actually_output; # List of files we generated.
1369 my @more_Names;            # Some code point names are compound; this is used
1370                            # to store the extra components of them.
1371 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1372                            # the minimum before we consider it equivalent to a
1373                            # candidate rational
1374 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1375
1376 # These store references to certain commonly used property objects
1377 my $gc;
1378 my $perl;
1379 my $block;
1380 my $perl_charname;
1381 my $print;
1382 my $Any;
1383 my $script;
1384
1385 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1386 my $has_In_conflicts = 0;
1387 my $has_Is_conflicts = 0;
1388
1389 sub internal_file_to_platform ($) {
1390     # Convert our file paths which have '/' separators to those of the
1391     # platform.
1392
1393     my $file = shift;
1394     return undef unless defined $file;
1395
1396     return File::Spec->join(split '/', $file);
1397 }
1398
1399 sub file_exists ($) {   # platform independent '-e'.  This program internally
1400                         # uses slash as a path separator.
1401     my $file = shift;
1402     return 0 if ! defined $file;
1403     return -e internal_file_to_platform($file);
1404 }
1405
1406 sub objaddr($) {
1407     # Returns the address of the blessed input object.
1408     # It doesn't check for blessedness because that would do a string eval
1409     # every call, and the program is structured so that this is never called
1410     # for a non-blessed object.
1411
1412     no overloading; # If overloaded, numifying below won't work.
1413
1414     # Numifying a ref gives its address.
1415     return pack 'J', $_[0];
1416 }
1417
1418 # These are used only if $annotate is true.
1419 # The entire range of Unicode characters is examined to populate these
1420 # after all the input has been processed.  But most can be skipped, as they
1421 # have the same descriptive phrases, such as being unassigned
1422 my @viacode;            # Contains the 1 million character names
1423 my @printable;          # boolean: And are those characters printable?
1424 my @annotate_char_type; # Contains a type of those characters, specifically
1425                         # for the purposes of annotation.
1426 my $annotate_ranges;    # A map of ranges of code points that have the same
1427                         # name for the purposes of annotation.  They map to the
1428                         # upper edge of the range, so that the end point can
1429                         # be immediately found.  This is used to skip ahead to
1430                         # the end of a range, and avoid processing each
1431                         # individual code point in it.
1432 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1433                                    # characters, but excluding those which are
1434                                    # also noncharacter code points
1435
1436 # The annotation types are an extension of the regular range types, though
1437 # some of the latter are folded into one.  Make the new types negative to
1438 # avoid conflicting with the regular types
1439 my $SURROGATE_TYPE = -1;
1440 my $UNASSIGNED_TYPE = -2;
1441 my $PRIVATE_USE_TYPE = -3;
1442 my $NONCHARACTER_TYPE = -4;
1443 my $CONTROL_TYPE = -5;
1444 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1445
1446 sub populate_char_info ($) {
1447     # Used only with the $annotate option.  Populates the arrays with the
1448     # input code point's info that are needed for outputting more detailed
1449     # comments.  If calling context wants a return, it is the end point of
1450     # any contiguous range of characters that share essentially the same info
1451
1452     my $i = shift;
1453     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1454
1455     $viacode[$i] = $perl_charname->value_of($i) || "";
1456
1457     # A character is generally printable if Unicode says it is,
1458     # but below we make sure that most Unicode general category 'C' types
1459     # aren't.
1460     $printable[$i] = $print->contains($i);
1461
1462     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1463
1464     # Only these two regular types are treated specially for annotations
1465     # purposes
1466     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1467                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1468
1469     # Give a generic name to all code points that don't have a real name.
1470     # We output ranges, if applicable, for these.  Also calculate the end
1471     # point of the range.
1472     my $end;
1473     if (! $viacode[$i]) {
1474         if ($gc-> table('Surrogate')->contains($i)) {
1475             $viacode[$i] = 'Surrogate';
1476             $annotate_char_type[$i] = $SURROGATE_TYPE;
1477             $printable[$i] = 0;
1478             $end = $gc->table('Surrogate')->containing_range($i)->end;
1479         }
1480         elsif ($gc-> table('Private_use')->contains($i)) {
1481             $viacode[$i] = 'Private Use';
1482             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1483             $printable[$i] = 0;
1484             $end = $gc->table('Private_Use')->containing_range($i)->end;
1485         }
1486         elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1487                                                                 contains($i))
1488         {
1489             $viacode[$i] = 'Noncharacter';
1490             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1491             $printable[$i] = 0;
1492             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1493                                                     containing_range($i)->end;
1494         }
1495         elsif ($gc-> table('Control')->contains($i)) {
1496             $viacode[$i] = 'Control';
1497             $annotate_char_type[$i] = $CONTROL_TYPE;
1498             $printable[$i] = 0;
1499             $end = 0x81 if $i == 0x80;  # Hard-code this one known case
1500         }
1501         elsif ($gc-> table('Unassigned')->contains($i)) {
1502             $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1503             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1504             $printable[$i] = 0;
1505
1506             # Because we name the unassigned by the blocks they are in, it
1507             # can't go past the end of that block, and it also can't go past
1508             # the unassigned range it is in.  The special table makes sure
1509             # that the non-characters, which are unassigned, are separated
1510             # out.
1511             $end = min($block->containing_range($i)->end,
1512                        $unassigned_sans_noncharacters-> containing_range($i)->
1513                                                                          end);
1514         }
1515         else {
1516             Carp::my_carp_bug("Can't figure out how to annotate "
1517                               . sprintf("U+%04X", $i)
1518                               . ".  Proceeding anyway.");
1519             $viacode[$i] = 'UNKNOWN';
1520             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1521             $printable[$i] = 0;
1522         }
1523     }
1524
1525     # Here, has a name, but if it's one in which the code point number is
1526     # appended to the name, do that.
1527     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1528         $viacode[$i] .= sprintf("-%04X", $i);
1529         $end = $perl_charname->containing_range($i)->end;
1530     }
1531
1532     # And here, has a name, but if it's a hangul syllable one, replace it with
1533     # the correct name from the Unicode algorithm
1534     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1535         use integer;
1536         my $SIndex = $i - $SBase;
1537         my $L = $LBase + $SIndex / $NCount;
1538         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1539         my $T = $TBase + $SIndex % $TCount;
1540         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1541         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1542         $end = $perl_charname->containing_range($i)->end;
1543     }
1544
1545     return if ! defined wantarray;
1546     return $i if ! defined $end;    # If not a range, return the input
1547
1548     # Save this whole range so can find the end point quickly
1549     $annotate_ranges->add_map($i, $end, $end);
1550
1551     return $end;
1552 }
1553
1554 # Commented code below should work on Perl 5.8.
1555 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1556 ## the native perl version of it (which is what would operate under miniperl)
1557 ## is extremely slow, as it does a string eval every call.
1558 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1559 #                            && defined eval "require Scalar::Util";
1560 #
1561 #sub objaddr($) {
1562 #    # Returns the address of the blessed input object.  Uses the XS version if
1563 #    # available.  It doesn't check for blessedness because that would do a
1564 #    # string eval every call, and the program is structured so that this is
1565 #    # never called for a non-blessed object.
1566 #
1567 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1568 #
1569 #    # Check at least that is a ref.
1570 #    my $pkg = ref($_[0]) or return undef;
1571 #
1572 #    # Change to a fake package to defeat any overloaded stringify
1573 #    bless $_[0], 'main::Fake';
1574 #
1575 #    # Numifying a ref gives its address.
1576 #    my $addr = pack 'J', $_[0];
1577 #
1578 #    # Return to original class
1579 #    bless $_[0], $pkg;
1580 #    return $addr;
1581 #}
1582
1583 sub max ($$) {
1584     my $a = shift;
1585     my $b = shift;
1586     return $a if $a >= $b;
1587     return $b;
1588 }
1589
1590 sub min ($$) {
1591     my $a = shift;
1592     my $b = shift;
1593     return $a if $a <= $b;
1594     return $b;
1595 }
1596
1597 sub clarify_number ($) {
1598     # This returns the input number with underscores inserted every 3 digits
1599     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1600     # checked.
1601
1602     my $number = shift;
1603     my $pos = length($number) - 3;
1604     return $number if $pos <= 1;
1605     while ($pos > 0) {
1606         substr($number, $pos, 0) = '_';
1607         $pos -= 3;
1608     }
1609     return $number;
1610 }
1611
1612
1613 package Carp;
1614
1615 # These routines give a uniform treatment of messages in this program.  They
1616 # are placed in the Carp package to cause the stack trace to not include them,
1617 # although an alternative would be to use another package and set @CARP_NOT
1618 # for it.
1619
1620 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1621
1622 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1623 # and overload trying to load Scalar:Util under miniperl.  See
1624 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1625 undef $overload::VERSION;
1626
1627 sub my_carp {
1628     my $message = shift || "";
1629     my $nofold = shift || 0;
1630
1631     if ($message) {
1632         $message = main::join_lines($message);
1633         $message =~ s/^$0: *//;     # Remove initial program name
1634         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1635         $message = "\n$0: $message;";
1636
1637         # Fold the message with program name, semi-colon end punctuation
1638         # (which looks good with the message that carp appends to it), and a
1639         # hanging indent for continuation lines.
1640         $message = main::simple_fold($message, "", 4) unless $nofold;
1641         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1642                                     # appends is to the same line
1643     }
1644
1645     return $message if defined wantarray;   # If a caller just wants the msg
1646
1647     carp $message;
1648     return;
1649 }
1650
1651 sub my_carp_bug {
1652     # This is called when it is clear that the problem is caused by a bug in
1653     # this program.
1654
1655     my $message = shift;
1656     $message =~ s/^$0: *//;
1657     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1658     carp $message;
1659     return;
1660 }
1661
1662 sub carp_too_few_args {
1663     if (@_ != 2) {
1664         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1665         return;
1666     }
1667
1668     my $args_ref = shift;
1669     my $count = shift;
1670
1671     my_carp_bug("Need at least $count arguments to "
1672         . (caller 1)[3]
1673         . ".  Instead got: '"
1674         . join ', ', @$args_ref
1675         . "'.  No action taken.");
1676     return;
1677 }
1678
1679 sub carp_extra_args {
1680     my $args_ref = shift;
1681     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1682
1683     unless (ref $args_ref) {
1684         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1685         return;
1686     }
1687     my ($package, $file, $line) = caller;
1688     my $subroutine = (caller 1)[3];
1689
1690     my $list;
1691     if (ref $args_ref eq 'HASH') {
1692         foreach my $key (keys %$args_ref) {
1693             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1694         }
1695         $list = join ', ', each %{$args_ref};
1696     }
1697     elsif (ref $args_ref eq 'ARRAY') {
1698         foreach my $arg (@$args_ref) {
1699             $arg = $UNDEF unless defined $arg;
1700         }
1701         $list = join ', ', @$args_ref;
1702     }
1703     else {
1704         my_carp_bug("Can't cope with ref "
1705                 . ref($args_ref)
1706                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1707         return;
1708     }
1709
1710     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1711     return;
1712 }
1713
1714 package main;
1715
1716 { # Closure
1717
1718     # This program uses the inside-out method for objects, as recommended in
1719     # "Perl Best Practices".  This closure aids in generating those.  There
1720     # are two routines.  setup_package() is called once per package to set
1721     # things up, and then set_access() is called for each hash representing a
1722     # field in the object.  These routines arrange for the object to be
1723     # properly destroyed when no longer used, and for standard accessor
1724     # functions to be generated.  If you need more complex accessors, just
1725     # write your own and leave those accesses out of the call to set_access().
1726     # More details below.
1727
1728     my %constructor_fields; # fields that are to be used in constructors; see
1729                             # below
1730
1731     # The values of this hash will be the package names as keys to other
1732     # hashes containing the name of each field in the package as keys, and
1733     # references to their respective hashes as values.
1734     my %package_fields;
1735
1736     sub setup_package {
1737         # Sets up the package, creating standard DESTROY and dump methods
1738         # (unless already defined).  The dump method is used in debugging by
1739         # simple_dumper().
1740         # The optional parameters are:
1741         #   a)  a reference to a hash, that gets populated by later
1742         #       set_access() calls with one of the accesses being
1743         #       'constructor'.  The caller can then refer to this, but it is
1744         #       not otherwise used by these two routines.
1745         #   b)  a reference to a callback routine to call during destruction
1746         #       of the object, before any fields are actually destroyed
1747
1748         my %args = @_;
1749         my $constructor_ref = delete $args{'Constructor_Fields'};
1750         my $destroy_callback = delete $args{'Destroy_Callback'};
1751         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1752
1753         my %fields;
1754         my $package = (caller)[0];
1755
1756         $package_fields{$package} = \%fields;
1757         $constructor_fields{$package} = $constructor_ref;
1758
1759         unless ($package->can('DESTROY')) {
1760             my $destroy_name = "${package}::DESTROY";
1761             no strict "refs";
1762
1763             # Use typeglob to give the anonymous subroutine the name we want
1764             *$destroy_name = sub {
1765                 my $self = shift;
1766                 my $addr = do { no overloading; pack 'J', $self; };
1767
1768                 $self->$destroy_callback if $destroy_callback;
1769                 foreach my $field (keys %{$package_fields{$package}}) {
1770                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1771                     delete $package_fields{$package}{$field}{$addr};
1772                 }
1773                 return;
1774             }
1775         }
1776
1777         unless ($package->can('dump')) {
1778             my $dump_name = "${package}::dump";
1779             no strict "refs";
1780             *$dump_name = sub {
1781                 my $self = shift;
1782                 return dump_inside_out($self, $package_fields{$package}, @_);
1783             }
1784         }
1785         return;
1786     }
1787
1788     sub set_access {
1789         # Arrange for the input field to be garbage collected when no longer
1790         # needed.  Also, creates standard accessor functions for the field
1791         # based on the optional parameters-- none if none of these parameters:
1792         #   'addable'    creates an 'add_NAME()' accessor function.
1793         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1794         #                function.
1795         #   'settable'   creates a 'set_NAME()' accessor function.
1796         #   'constructor' doesn't create an accessor function, but adds the
1797         #                field to the hash that was previously passed to
1798         #                setup_package();
1799         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1800         # 'add' etc. all mean 'addable'.
1801         # The read accessor function will work on both array and scalar
1802         # values.  If another accessor in the parameter list is 'a', the read
1803         # access assumes an array.  You can also force it to be array access
1804         # by specifying 'readable_array' instead of 'readable'
1805         #
1806         # A sort-of 'protected' access can be set-up by preceding the addable,
1807         # readable or settable with some initial portion of 'protected_' (but,
1808         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1809         # "protection" is only by convention.  All that happens is that the
1810         # accessor functions' names begin with an underscore.  So instead of
1811         # calling set_foo, the call is _set_foo.  (Real protection could be
1812         # accomplished by having a new subroutine, end_package, called at the
1813         # end of each package, and then storing the __LINE__ ranges and
1814         # checking them on every accessor.  But that is way overkill.)
1815
1816         # We create anonymous subroutines as the accessors and then use
1817         # typeglobs to assign them to the proper package and name
1818
1819         my $name = shift;   # Name of the field
1820         my $field = shift;  # Reference to the inside-out hash containing the
1821                             # field
1822
1823         my $package = (caller)[0];
1824
1825         if (! exists $package_fields{$package}) {
1826             croak "$0: Must call 'setup_package' before 'set_access'";
1827         }
1828
1829         # Stash the field so DESTROY can get it.
1830         $package_fields{$package}{$name} = $field;
1831
1832         # Remaining arguments are the accessors.  For each...
1833         foreach my $access (@_) {
1834             my $access = lc $access;
1835
1836             my $protected = "";
1837
1838             # Match the input as far as it goes.
1839             if ($access =~ /^(p[^_]*)_/) {
1840                 $protected = $1;
1841                 if (substr('protected_', 0, length $protected)
1842                     eq $protected)
1843                 {
1844
1845                     # Add 1 for the underscore not included in $protected
1846                     $access = substr($access, length($protected) + 1);
1847                     $protected = '_';
1848                 }
1849                 else {
1850                     $protected = "";
1851                 }
1852             }
1853
1854             if (substr('addable', 0, length $access) eq $access) {
1855                 my $subname = "${package}::${protected}add_$name";
1856                 no strict "refs";
1857
1858                 # add_ accessor.  Don't add if already there, which we
1859                 # determine using 'eq' for scalars and '==' otherwise.
1860                 *$subname = sub {
1861                     use strict "refs";
1862                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1863                     my $self = shift;
1864                     my $value = shift;
1865                     my $addr = do { no overloading; pack 'J', $self; };
1866                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1867                     if (ref $value) {
1868                         return if grep { $value == $_ } @{$field->{$addr}};
1869                     }
1870                     else {
1871                         return if grep { $value eq $_ } @{$field->{$addr}};
1872                     }
1873                     push @{$field->{$addr}}, $value;
1874                     return;
1875                 }
1876             }
1877             elsif (substr('constructor', 0, length $access) eq $access) {
1878                 if ($protected) {
1879                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1880                 }
1881                 else {
1882                     $constructor_fields{$package}{$name} = $field;
1883                 }
1884             }
1885             elsif (substr('readable_array', 0, length $access) eq $access) {
1886
1887                 # Here has read access.  If one of the other parameters for
1888                 # access is array, or this one specifies array (by being more
1889                 # than just 'readable_'), then create a subroutine that
1890                 # assumes the data is an array.  Otherwise just a scalar
1891                 my $subname = "${package}::${protected}$name";
1892                 if (grep { /^a/i } @_
1893                     or length($access) > length('readable_'))
1894                 {
1895                     no strict "refs";
1896                     *$subname = sub {
1897                         use strict "refs";
1898                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1899                         my $addr = do { no overloading; pack 'J', $_[0]; };
1900                         if (ref $field->{$addr} ne 'ARRAY') {
1901                             my $type = ref $field->{$addr};
1902                             $type = 'scalar' unless $type;
1903                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1904                             return;
1905                         }
1906                         return scalar @{$field->{$addr}} unless wantarray;
1907
1908                         # Make a copy; had problems with caller modifying the
1909                         # original otherwise
1910                         my @return = @{$field->{$addr}};
1911                         return @return;
1912                     }
1913                 }
1914                 else {
1915
1916                     # Here not an array value, a simpler function.
1917                     no strict "refs";
1918                     *$subname = sub {
1919                         use strict "refs";
1920                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1921                         no overloading;
1922                         return $field->{pack 'J', $_[0]};
1923                     }
1924                 }
1925             }
1926             elsif (substr('settable', 0, length $access) eq $access) {
1927                 my $subname = "${package}::${protected}set_$name";
1928                 no strict "refs";
1929                 *$subname = sub {
1930                     use strict "refs";
1931                     if (main::DEBUG) {
1932                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1933                         Carp::carp_extra_args(\@_) if @_ > 2;
1934                     }
1935                     # $self is $_[0]; $value is $_[1]
1936                     no overloading;
1937                     $field->{pack 'J', $_[0]} = $_[1];
1938                     return;
1939                 }
1940             }
1941             else {
1942                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1943             }
1944         }
1945         return;
1946     }
1947 }
1948
1949 package Input_file;
1950
1951 # All input files use this object, which stores various attributes about them,
1952 # and provides for convenient, uniform handling.  The run method wraps the
1953 # processing.  It handles all the bookkeeping of opening, reading, and closing
1954 # the file, returning only significant input lines.
1955 #
1956 # Each object gets a handler which processes the body of the file, and is
1957 # called by run().  Most should use the generic, default handler, which has
1958 # code scrubbed to handle things you might not expect.  A handler should
1959 # basically be a while(next_line()) {...} loop.
1960 #
1961 # You can also set up handlers to
1962 #   1) call before the first line is read for pre processing
1963 #   2) call to adjust each line of the input before the main handler gets them
1964 #   3) call upon EOF before the main handler exits its loop
1965 #   4) call at the end for post processing
1966 #
1967 # $_ is used to store the input line, and is to be filtered by the
1968 # each_line_handler()s.  So, if the format of the line is not in the desired
1969 # format for the main handler, these are used to do that adjusting.  They can
1970 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1971 # so the $_ output of one is used as the input to the next.  None of the other
1972 # handlers are stackable, but could easily be changed to be so.
1973 #
1974 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1975 # which insert the parameters as lines to be processed before the next input
1976 # file line is read.  This allows the EOF handler to flush buffers, for
1977 # example.  The difference between the two routines is that the lines inserted
1978 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1979 # called it from such a handler, you would get infinite recursion.)  Lines
1980 # inserted by insert_adjusted_lines() go directly to the main handler without
1981 # any adjustments.  If the  post-processing handler calls any of these, there
1982 # will be no effect.  Some error checking for these conditions could be added,
1983 # but it hasn't been done.
1984 #
1985 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1986 # to prevent further processing of the line.  This routine will output the
1987 # message as a warning once, and then keep a count of the lines that have the
1988 # same message, and output that count at the end of the file's processing.
1989 # This keeps the number of messages down to a manageable amount.
1990 #
1991 # get_missings() should be called to retrieve any @missing input lines.
1992 # Messages will be raised if this isn't done if the options aren't to ignore
1993 # missings.
1994
1995 sub trace { return main::trace(@_); }
1996
1997 { # Closure
1998     # Keep track of fields that are to be put into the constructor.
1999     my %constructor_fields;
2000
2001     main::setup_package(Constructor_Fields => \%constructor_fields);
2002
2003     my %file; # Input file name, required
2004     main::set_access('file', \%file, qw{ c r });
2005
2006     my %first_released; # Unicode version file was first released in, required
2007     main::set_access('first_released', \%first_released, qw{ c r });
2008
2009     my %handler;    # Subroutine to process the input file, defaults to
2010                     # 'process_generic_property_file'
2011     main::set_access('handler', \%handler, qw{ c });
2012
2013     my %property;
2014     # name of property this file is for.  defaults to none, meaning not
2015     # applicable, or is otherwise determinable, for example, from each line.
2016     main::set_access('property', \%property, qw{ c });
2017
2018     my %optional;
2019     # If this is true, the file is optional.  If not present, no warning is
2020     # output.  If it is present, the string given by this parameter is
2021     # evaluated, and if false the file is not processed.
2022     main::set_access('optional', \%optional, 'c', 'r');
2023
2024     my %non_skip;
2025     # This is used for debugging, to skip processing of all but a few input
2026     # files.  Add 'non_skip => 1' to the constructor for those files you want
2027     # processed when you set the $debug_skip global.
2028     main::set_access('non_skip', \%non_skip, 'c');
2029
2030     my %skip;
2031     # This is used to skip processing of this input file semi-permanently,
2032     # when it evaluates to true.  The value should be the reason the file is
2033     # being skipped.  It is used for files that we aren't planning to process
2034     # anytime soon, but want to allow to be in the directory and not raise a
2035     # message that we are not handling.  Mostly for test files.  This is in
2036     # contrast to the non_skip element, which is supposed to be used very
2037     # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
2038     # pretty much will never look at can be placed in the global
2039     # %ignored_files instead.  Ones used here will be added to %skipped files
2040     main::set_access('skip', \%skip, 'c');
2041
2042     my %each_line_handler;
2043     # list of subroutines to look at and filter each non-comment line in the
2044     # file.  defaults to none.  The subroutines are called in order, each is
2045     # to adjust $_ for the next one, and the final one adjusts it for
2046     # 'handler'
2047     main::set_access('each_line_handler', \%each_line_handler, 'c');
2048
2049     my %has_missings_defaults;
2050     # ? Are there lines in the file giving default values for code points
2051     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2052     # the norm, but IGNORED means it has such lines, but the handler doesn't
2053     # use them.  Having these three states allows us to catch changes to the
2054     # UCD that this program should track
2055     main::set_access('has_missings_defaults',
2056                                         \%has_missings_defaults, qw{ c r });
2057
2058     my %pre_handler;
2059     # Subroutine to call before doing anything else in the file.  If undef, no
2060     # such handler is called.
2061     main::set_access('pre_handler', \%pre_handler, qw{ c });
2062
2063     my %eof_handler;
2064     # Subroutine to call upon getting an EOF on the input file, but before
2065     # that is returned to the main handler.  This is to allow buffers to be
2066     # flushed.  The handler is expected to call insert_lines() or
2067     # insert_adjusted() with the buffered material
2068     main::set_access('eof_handler', \%eof_handler, qw{ c r });
2069
2070     my %post_handler;
2071     # Subroutine to call after all the lines of the file are read in and
2072     # processed.  If undef, no such handler is called.
2073     main::set_access('post_handler', \%post_handler, qw{ c });
2074
2075     my %progress_message;
2076     # Message to print to display progress in lieu of the standard one
2077     main::set_access('progress_message', \%progress_message, qw{ c });
2078
2079     my %handle;
2080     # cache open file handle, internal.  Is undef if file hasn't been
2081     # processed at all, empty if has;
2082     main::set_access('handle', \%handle);
2083
2084     my %added_lines;
2085     # cache of lines added virtually to the file, internal
2086     main::set_access('added_lines', \%added_lines);
2087
2088     my %errors;
2089     # cache of errors found, internal
2090     main::set_access('errors', \%errors);
2091
2092     my %missings;
2093     # storage of '@missing' defaults lines
2094     main::set_access('missings', \%missings);
2095
2096     sub new {
2097         my $class = shift;
2098
2099         my $self = bless \do{ my $anonymous_scalar }, $class;
2100         my $addr = do { no overloading; pack 'J', $self; };
2101
2102         # Set defaults
2103         $handler{$addr} = \&main::process_generic_property_file;
2104         $non_skip{$addr} = 0;
2105         $skip{$addr} = 0;
2106         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2107         $handle{$addr} = undef;
2108         $added_lines{$addr} = [ ];
2109         $each_line_handler{$addr} = [ ];
2110         $errors{$addr} = { };
2111         $missings{$addr} = [ ];
2112
2113         # Two positional parameters.
2114         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2115         $file{$addr} = main::internal_file_to_platform(shift);
2116         $first_released{$addr} = shift;
2117
2118         # The rest of the arguments are key => value pairs
2119         # %constructor_fields has been set up earlier to list all possible
2120         # ones.  Either set or push, depending on how the default has been set
2121         # up just above.
2122         my %args = @_;
2123         foreach my $key (keys %args) {
2124             my $argument = $args{$key};
2125
2126             # Note that the fields are the lower case of the constructor keys
2127             my $hash = $constructor_fields{lc $key};
2128             if (! defined $hash) {
2129                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2130                 next;
2131             }
2132             if (ref $hash->{$addr} eq 'ARRAY') {
2133                 if (ref $argument eq 'ARRAY') {
2134                     foreach my $argument (@{$argument}) {
2135                         next if ! defined $argument;
2136                         push @{$hash->{$addr}}, $argument;
2137                     }
2138                 }
2139                 else {
2140                     push @{$hash->{$addr}}, $argument if defined $argument;
2141                 }
2142             }
2143             else {
2144                 $hash->{$addr} = $argument;
2145             }
2146             delete $args{$key};
2147         };
2148
2149         # If the file has a property for it, it means that the property is not
2150         # listed in the file's entries.  So add a handler to the list of line
2151         # handlers to insert the property name into the lines, to provide a
2152         # uniform interface to the final processing subroutine.
2153         # the final code doesn't have to worry about that.
2154         if ($property{$addr}) {
2155             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2156         }
2157
2158         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2159             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2160         }
2161
2162         # If skipping, set to optional, and add to list of ignored files,
2163         # including its reason
2164         if ($skip{$addr}) {
2165             $optional{$addr} = 1;
2166             $skipped_files{$file{$addr}} = $skip{$addr}
2167         }
2168
2169         return $self;
2170     }
2171
2172
2173     use overload
2174         fallback => 0,
2175         qw("") => "_operator_stringify",
2176         "." => \&main::_operator_dot,
2177     ;
2178
2179     sub _operator_stringify {
2180         my $self = shift;
2181
2182         return __PACKAGE__ . " object for " . $self->file;
2183     }
2184
2185     # flag to make sure extracted files are processed early
2186     my $seen_non_extracted_non_age = 0;
2187
2188     sub run {
2189         # Process the input object $self.  This opens and closes the file and
2190         # calls all the handlers for it.  Currently,  this can only be called
2191         # once per file, as it destroy's the EOF handler
2192
2193         my $self = shift;
2194         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2195
2196         my $addr = do { no overloading; pack 'J', $self; };
2197
2198         my $file = $file{$addr};
2199
2200         # Don't process if not expecting this file (because released later
2201         # than this Unicode version), and isn't there.  This means if someone
2202         # copies it into an earlier version's directory, we will go ahead and
2203         # process it.
2204         return if $first_released{$addr} gt $v_version && ! -e $file;
2205
2206         # If in debugging mode and this file doesn't have the non-skip
2207         # flag set, and isn't one of the critical files, skip it.
2208         if ($debug_skip
2209             && $first_released{$addr} ne v0
2210             && ! $non_skip{$addr})
2211         {
2212             print "Skipping $file in debugging\n" if $verbosity;
2213             return;
2214         }
2215
2216         # File could be optional
2217         if ($optional{$addr}) {
2218             return unless -e $file;
2219             my $result = eval $optional{$addr};
2220             if (! defined $result) {
2221                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2222                 return;
2223             }
2224             if (! $result) {
2225                 if ($verbosity) {
2226                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2227                 }
2228                 return;
2229             }
2230         }
2231
2232         if (! defined $file || ! -e $file) {
2233
2234             # If the file doesn't exist, see if have internal data for it
2235             # (based on first_released being 0).
2236             if ($first_released{$addr} eq v0) {
2237                 $handle{$addr} = 'pretend_is_open';
2238             }
2239             else {
2240                 if (! $optional{$addr}  # File could be optional
2241                     && $v_version ge $first_released{$addr})
2242                 {
2243                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2244                 }
2245                 return;
2246             }
2247         }
2248         else {
2249
2250             # Here, the file exists.  Some platforms may change the case of
2251             # its name
2252             if ($seen_non_extracted_non_age) {
2253                 if ($file =~ /$EXTRACTED/i) {
2254                     Carp::my_carp_bug(main::join_lines(<<END
2255 $file should be processed just after the 'Prop...Alias' files, and before
2256 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2257 have subtle problems
2258 END
2259                     ));
2260                 }
2261             }
2262             elsif ($EXTRACTED_DIR
2263                     && $first_released{$addr} ne v0
2264                     && $file !~ /$EXTRACTED/i
2265                     && lc($file) ne 'dage.txt')
2266             {
2267                 # We don't set this (by the 'if' above) if we have no
2268                 # extracted directory, so if running on an early version,
2269                 # this test won't work.  Not worth worrying about.
2270                 $seen_non_extracted_non_age = 1;
2271             }
2272
2273             # And mark the file as having being processed, and warn if it
2274             # isn't a file we are expecting.  As we process the files,
2275             # they are deleted from the hash, so any that remain at the
2276             # end of the program are files that we didn't process.
2277             my $fkey = File::Spec->rel2abs($file);
2278             my $expecting = delete $potential_files{lc($fkey)};
2279
2280             Carp::my_carp("Was not expecting '$file'.") if
2281                     ! $expecting
2282                     && ! defined $handle{$addr};
2283
2284             # Having deleted from expected files, we can quit if not to do
2285             # anything.  Don't print progress unless really want verbosity
2286             if ($skip{$addr}) {
2287                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2288                 return;
2289             }
2290
2291             # Open the file, converting the slashes used in this program
2292             # into the proper form for the OS
2293             my $file_handle;
2294             if (not open $file_handle, "<", $file) {
2295                 Carp::my_carp("Can't open $file.  Skipping: $!");
2296                 return 0;
2297             }
2298             $handle{$addr} = $file_handle; # Cache the open file handle
2299         }
2300
2301         if ($verbosity >= $PROGRESS) {
2302             if ($progress_message{$addr}) {
2303                 print "$progress_message{$addr}\n";
2304             }
2305             else {
2306                 # If using a virtual file, say so.
2307                 print "Processing ", (-e $file)
2308                                        ? $file
2309                                        : "substitute $file",
2310                                      "\n";
2311             }
2312         }
2313
2314
2315         # Call any special handler for before the file.
2316         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2317
2318         # Then the main handler
2319         &{$handler{$addr}}($self);
2320
2321         # Then any special post-file handler.
2322         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2323
2324         # If any errors have been accumulated, output the counts (as the first
2325         # error message in each class was output when it was encountered).
2326         if ($errors{$addr}) {
2327             my $total = 0;
2328             my $types = 0;
2329             foreach my $error (keys %{$errors{$addr}}) {
2330                 $total += $errors{$addr}->{$error};
2331                 delete $errors{$addr}->{$error};
2332                 $types++;
2333             }
2334             if ($total > 1) {
2335                 my $message
2336                         = "A total of $total lines had errors in $file.  ";
2337
2338                 $message .= ($types == 1)
2339                             ? '(Only the first one was displayed.)'
2340                             : '(Only the first of each type was displayed.)';
2341                 Carp::my_carp($message);
2342             }
2343         }
2344
2345         if (@{$missings{$addr}}) {
2346             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2347         }
2348
2349         # If a real file handle, close it.
2350         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2351                                                         ref $handle{$addr};
2352         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2353                                # the file, as opposed to undef
2354         return;
2355     }
2356
2357     sub next_line {
2358         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2359         # if such a line exists.  'logical' means that any lines that have
2360         # been added via insert_lines() will be returned in $_ before the file
2361         # is read again.
2362
2363         my $self = shift;
2364         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2365
2366         my $addr = do { no overloading; pack 'J', $self; };
2367
2368         # Here the file is open (or if the handle is not a ref, is an open
2369         # 'virtual' file).  Get the next line; any inserted lines get priority
2370         # over the file itself.
2371         my $adjusted;
2372
2373         LINE:
2374         while (1) { # Loop until find non-comment, non-empty line
2375             #local $to_trace = 1 if main::DEBUG;
2376             my $inserted_ref = shift @{$added_lines{$addr}};
2377             if (defined $inserted_ref) {
2378                 ($adjusted, $_) = @{$inserted_ref};
2379                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2380                 return 1 if $adjusted;
2381             }
2382             else {
2383                 last if ! ref $handle{$addr}; # Don't read unless is real file
2384                 last if ! defined ($_ = readline $handle{$addr});
2385             }
2386             chomp;
2387             trace $_ if main::DEBUG && $to_trace;
2388
2389             # See if this line is the comment line that defines what property
2390             # value that code points that are not listed in the file should
2391             # have.  The format or existence of these lines is not guaranteed
2392             # by Unicode since they are comments, but the documentation says
2393             # that this was added for machine-readability, so probably won't
2394             # change.  This works starting in Unicode Version 5.0.  They look
2395             # like:
2396             #
2397             # @missing: 0000..10FFFF; Not_Reordered
2398             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2399             # @missing: 0000..10FFFF; ; NaN
2400             #
2401             # Save the line for a later get_missings() call.
2402             if (/$missing_defaults_prefix/) {
2403                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2404                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2405                 }
2406                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2407                     my @defaults = split /\s* ; \s*/x, $_;
2408
2409                     # The first field is the @missing, which ends in a
2410                     # semi-colon, so can safely shift.
2411                     shift @defaults;
2412
2413                     # Some of these lines may have empty field placeholders
2414                     # which get in the way.  An example is:
2415                     # @missing: 0000..10FFFF; ; NaN
2416                     # Remove them.  Process starting from the top so the
2417                     # splice doesn't affect things still to be looked at.
2418                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2419                         next if $defaults[$i] ne "";
2420                         splice @defaults, $i, 1;
2421                     }
2422
2423                     # What's left should be just the property (maybe) and the
2424                     # default.  Having only one element means it doesn't have
2425                     # the property.
2426                     my $default;
2427                     my $property;
2428                     if (@defaults >= 1) {
2429                         if (@defaults == 1) {
2430                             $default = $defaults[0];
2431                         }
2432                         else {
2433                             $property = $defaults[0];
2434                             $default = $defaults[1];
2435                         }
2436                     }
2437
2438                     if (@defaults < 1
2439                         || @defaults > 2
2440                         || ($default =~ /^</
2441                             && $default !~ /^<code *point>$/i
2442                             && $default !~ /^<none>$/i
2443                             && $default !~ /^<script>$/i))
2444                     {
2445                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2446                     }
2447                     else {
2448
2449                         # If the property is missing from the line, it should
2450                         # be the one for the whole file
2451                         $property = $property{$addr} if ! defined $property;
2452
2453                         # Change <none> to the null string, which is what it
2454                         # really means.  If the default is the code point
2455                         # itself, set it to <code point>, which is what
2456                         # Unicode uses (but sometimes they've forgotten the
2457                         # space)
2458                         if ($default =~ /^<none>$/i) {
2459                             $default = "";
2460                         }
2461                         elsif ($default =~ /^<code *point>$/i) {
2462                             $default = $CODE_POINT;
2463                         }
2464                         elsif ($default =~ /^<script>$/i) {
2465
2466                             # Special case this one.  Currently is from
2467                             # ScriptExtensions.txt, and means for all unlisted
2468                             # code points, use their Script property values.
2469                             # For the code points not listed in that file, the
2470                             # default value is 'Unknown'.
2471                             $default = "Unknown";
2472                         }
2473
2474                         # Store them as a sub-arrays with both components.
2475                         push @{$missings{$addr}}, [ $default, $property ];
2476                     }
2477                 }
2478
2479                 # There is nothing for the caller to process on this comment
2480                 # line.
2481                 next;
2482             }
2483
2484             # Remove comments and trailing space, and skip this line if the
2485             # result is empty
2486             s/#.*//;
2487             s/\s+$//;
2488             next if /^$/;
2489
2490             # Call any handlers for this line, and skip further processing of
2491             # the line if the handler sets the line to null.
2492             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2493                 &{$sub_ref}($self);
2494                 next LINE if /^$/;
2495             }
2496
2497             # Here the line is ok.  return success.
2498             return 1;
2499         } # End of looping through lines.
2500
2501         # If there is an EOF handler, call it (only once) and if it generates
2502         # more lines to process go back in the loop to handle them.
2503         if ($eof_handler{$addr}) {
2504             &{$eof_handler{$addr}}($self);
2505             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2506             goto LINE if $added_lines{$addr};
2507         }
2508
2509         # Return failure -- no more lines.
2510         return 0;
2511
2512     }
2513
2514 #   Not currently used, not fully tested.
2515 #    sub peek {
2516 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2517 #        # record.  Not callable from an each_line_handler(), nor does it call
2518 #        # an each_line_handler() on the line.
2519 #
2520 #        my $self = shift;
2521 #        my $addr = do { no overloading; pack 'J', $self; };
2522 #
2523 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2524 #            my ($adjusted, $line) = @{$inserted_ref};
2525 #            next if $adjusted;
2526 #
2527 #            # Remove comments and trailing space, and return a non-empty
2528 #            # resulting line
2529 #            $line =~ s/#.*//;
2530 #            $line =~ s/\s+$//;
2531 #            return $line if $line ne "";
2532 #        }
2533 #
2534 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2535 #        while (1) { # Loop until find non-comment, non-empty line
2536 #            local $to_trace = 1 if main::DEBUG;
2537 #            trace $_ if main::DEBUG && $to_trace;
2538 #            return if ! defined (my $line = readline $handle{$addr});
2539 #            chomp $line;
2540 #            push @{$added_lines{$addr}}, [ 0, $line ];
2541 #
2542 #            $line =~ s/#.*//;
2543 #            $line =~ s/\s+$//;
2544 #            return $line if $line ne "";
2545 #        }
2546 #
2547 #        return;
2548 #    }
2549
2550
2551     sub insert_lines {
2552         # Lines can be inserted so that it looks like they were in the input
2553         # file at the place it was when this routine is called.  See also
2554         # insert_adjusted_lines().  Lines inserted via this routine go through
2555         # any each_line_handler()
2556
2557         my $self = shift;
2558
2559         # Each inserted line is an array, with the first element being 0 to
2560         # indicate that this line hasn't been adjusted, and needs to be
2561         # processed.
2562         no overloading;
2563         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2564         return;
2565     }
2566
2567     sub insert_adjusted_lines {
2568         # Lines can be inserted so that it looks like they were in the input
2569         # file at the place it was when this routine is called.  See also
2570         # insert_lines().  Lines inserted via this routine are already fully
2571         # adjusted, ready to be processed; each_line_handler()s handlers will
2572         # not be called.  This means this is not a completely general
2573         # facility, as only the last each_line_handler on the stack should
2574         # call this.  It could be made more general, by passing to each of the
2575         # line_handlers their position on the stack, which they would pass on
2576         # to this routine, and that would replace the boolean first element in
2577         # the anonymous array pushed here, so that the next_line routine could
2578         # use that to call only those handlers whose index is after it on the
2579         # stack.  But this is overkill for what is needed now.
2580
2581         my $self = shift;
2582         trace $_[0] if main::DEBUG && $to_trace;
2583
2584         # Each inserted line is an array, with the first element being 1 to
2585         # indicate that this line has been adjusted
2586         no overloading;
2587         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2588         return;
2589     }
2590
2591     sub get_missings {
2592         # Returns the stored up @missings lines' values, and clears the list.
2593         # The values are in an array, consisting of the default in the first
2594         # element, and the property in the 2nd.  However, since these lines
2595         # can be stacked up, the return is an array of all these arrays.
2596
2597         my $self = shift;
2598         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2599
2600         my $addr = do { no overloading; pack 'J', $self; };
2601
2602         # If not accepting a list return, just return the first one.
2603         return shift @{$missings{$addr}} unless wantarray;
2604
2605         my @return = @{$missings{$addr}};
2606         undef @{$missings{$addr}};
2607         return @return;
2608     }
2609
2610     sub _insert_property_into_line {
2611         # Add a property field to $_, if this file requires it.
2612
2613         my $self = shift;
2614         my $addr = do { no overloading; pack 'J', $self; };
2615         my $property = $property{$addr};
2616         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2617
2618         $_ =~ s/(;|$)/; $property$1/;
2619         return;
2620     }
2621
2622     sub carp_bad_line {
2623         # Output consistent error messages, using either a generic one, or the
2624         # one given by the optional parameter.  To avoid gazillions of the
2625         # same message in case the syntax of a  file is way off, this routine
2626         # only outputs the first instance of each message, incrementing a
2627         # count so the totals can be output at the end of the file.
2628
2629         my $self = shift;
2630         my $message = shift;
2631         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2632
2633         my $addr = do { no overloading; pack 'J', $self; };
2634
2635         $message = 'Unexpected line' unless $message;
2636
2637         # No trailing punctuation so as to fit with our addenda.
2638         $message =~ s/[.:;,]$//;
2639
2640         # If haven't seen this exact message before, output it now.  Otherwise
2641         # increment the count of how many times it has occurred
2642         unless ($errors{$addr}->{$message}) {
2643             Carp::my_carp("$message in '$_' in "
2644                             . $file{$addr}
2645                             . " at line $..  Skipping this line;");
2646             $errors{$addr}->{$message} = 1;
2647         }
2648         else {
2649             $errors{$addr}->{$message}++;
2650         }
2651
2652         # Clear the line to prevent any further (meaningful) processing of it.
2653         $_ = "";
2654
2655         return;
2656     }
2657 } # End closure
2658
2659 package Multi_Default;
2660
2661 # Certain properties in early versions of Unicode had more than one possible
2662 # default for code points missing from the files.  In these cases, one
2663 # default applies to everything left over after all the others are applied,
2664 # and for each of the others, there is a description of which class of code
2665 # points applies to it.  This object helps implement this by storing the
2666 # defaults, and for all but that final default, an eval string that generates
2667 # the class that it applies to.
2668
2669
2670 {   # Closure
2671
2672     main::setup_package();
2673
2674     my %class_defaults;
2675     # The defaults structure for the classes
2676     main::set_access('class_defaults', \%class_defaults);
2677
2678     my %other_default;
2679     # The default that applies to everything left over.
2680     main::set_access('other_default', \%other_default, 'r');
2681
2682
2683     sub new {
2684         # The constructor is called with default => eval pairs, terminated by
2685         # the left-over default. e.g.
2686         # Multi_Default->new(
2687         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2688         #               -  0x200D',
2689         #        'R' => 'some other expression that evaluates to code points',
2690         #        .
2691         #        .
2692         #        .
2693         #        'U'));
2694
2695         my $class = shift;
2696
2697         my $self = bless \do{my $anonymous_scalar}, $class;
2698         my $addr = do { no overloading; pack 'J', $self; };
2699
2700         while (@_ > 1) {
2701             my $default = shift;
2702             my $eval = shift;
2703             $class_defaults{$addr}->{$default} = $eval;
2704         }
2705
2706         $other_default{$addr} = shift;
2707
2708         return $self;
2709     }
2710
2711     sub get_next_defaults {
2712         # Iterates and returns the next class of defaults.
2713         my $self = shift;
2714         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2715
2716         my $addr = do { no overloading; pack 'J', $self; };
2717
2718         return each %{$class_defaults{$addr}};
2719     }
2720 }
2721
2722 package Alias;
2723
2724 # An alias is one of the names that a table goes by.  This class defines them
2725 # including some attributes.  Everything is currently setup in the
2726 # constructor.
2727
2728
2729 {   # Closure
2730
2731     main::setup_package();
2732
2733     my %name;
2734     main::set_access('name', \%name, 'r');
2735
2736     my %loose_match;
2737     # Should this name match loosely or not.
2738     main::set_access('loose_match', \%loose_match, 'r');
2739
2740     my %make_re_pod_entry;
2741     # Some aliases should not get their own entries in the re section of the
2742     # pod, because they are covered by a wild-card, and some we want to
2743     # discourage use of.  Binary
2744     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2745
2746     my %ucd;
2747     # Is this documented to be accessible via Unicode::UCD
2748     main::set_access('ucd', \%ucd, 'r', 's');
2749
2750     my %status;
2751     # Aliases have a status, like deprecated, or even suppressed (which means
2752     # they don't appear in documentation).  Enum
2753     main::set_access('status', \%status, 'r');
2754
2755     my %ok_as_filename;
2756     # Similarly, some aliases should not be considered as usable ones for
2757     # external use, such as file names, or we don't want documentation to
2758     # recommend them.  Boolean
2759     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2760
2761     sub new {
2762         my $class = shift;
2763
2764         my $self = bless \do { my $anonymous_scalar }, $class;
2765         my $addr = do { no overloading; pack 'J', $self; };
2766
2767         $name{$addr} = shift;
2768         $loose_match{$addr} = shift;
2769         $make_re_pod_entry{$addr} = shift;
2770         $ok_as_filename{$addr} = shift;
2771         $status{$addr} = shift;
2772         $ucd{$addr} = shift;
2773
2774         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2775
2776         # Null names are never ok externally
2777         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2778
2779         return $self;
2780     }
2781 }
2782
2783 package Range;
2784
2785 # A range is the basic unit for storing code points, and is described in the
2786 # comments at the beginning of the program.  Each range has a starting code
2787 # point; an ending code point (not less than the starting one); a value
2788 # that applies to every code point in between the two end-points, inclusive;
2789 # and an enum type that applies to the value.  The type is for the user's
2790 # convenience, and has no meaning here, except that a non-zero type is
2791 # considered to not obey the normal Unicode rules for having standard forms.
2792 #
2793 # The same structure is used for both map and match tables, even though in the
2794 # latter, the value (and hence type) is irrelevant and could be used as a
2795 # comment.  In map tables, the value is what all the code points in the range
2796 # map to.  Type 0 values have the standardized version of the value stored as
2797 # well, so as to not have to recalculate it a lot.
2798
2799 sub trace { return main::trace(@_); }
2800
2801 {   # Closure
2802
2803     main::setup_package();
2804
2805     my %start;
2806     main::set_access('start', \%start, 'r', 's');
2807
2808     my %end;
2809     main::set_access('end', \%end, 'r', 's');
2810
2811     my %value;
2812     main::set_access('value', \%value, 'r');
2813
2814     my %type;
2815     main::set_access('type', \%type, 'r');
2816
2817     my %standard_form;
2818     # The value in internal standard form.  Defined only if the type is 0.
2819     main::set_access('standard_form', \%standard_form);
2820
2821     # Note that if these fields change, the dump() method should as well
2822
2823     sub new {
2824         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2825         my $class = shift;
2826
2827         my $self = bless \do { my $anonymous_scalar }, $class;
2828         my $addr = do { no overloading; pack 'J', $self; };
2829
2830         $start{$addr} = shift;
2831         $end{$addr} = shift;
2832
2833         my %args = @_;
2834
2835         my $value = delete $args{'Value'};  # Can be 0
2836         $value = "" unless defined $value;
2837         $value{$addr} = $value;
2838
2839         $type{$addr} = delete $args{'Type'} || 0;
2840
2841         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2842
2843         if (! $type{$addr}) {
2844             $standard_form{$addr} = main::standardize($value);
2845         }
2846
2847         return $self;
2848     }
2849
2850     use overload
2851         fallback => 0,
2852         qw("") => "_operator_stringify",
2853         "." => \&main::_operator_dot,
2854     ;
2855
2856     sub _operator_stringify {
2857         my $self = shift;
2858         my $addr = do { no overloading; pack 'J', $self; };
2859
2860         # Output it like '0041..0065 (value)'
2861         my $return = sprintf("%04X", $start{$addr})
2862                         .  '..'
2863                         . sprintf("%04X", $end{$addr});
2864         my $value = $value{$addr};
2865         my $type = $type{$addr};
2866         $return .= ' (';
2867         $return .= "$value";
2868         $return .= ", Type=$type" if $type != 0;
2869         $return .= ')';
2870
2871         return $return;
2872     }
2873
2874     sub standard_form {
2875         # The standard form is the value itself if the standard form is
2876         # undefined (that is if the value is special)
2877
2878         my $self = shift;
2879         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2880
2881         my $addr = do { no overloading; pack 'J', $self; };
2882
2883         return $standard_form{$addr} if defined $standard_form{$addr};
2884         return $value{$addr};
2885     }
2886
2887     sub dump {
2888         # Human, not machine readable.  For machine readable, comment out this
2889         # entire routine and let the standard one take effect.
2890         my $self = shift;
2891         my $indent = shift;
2892         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2893
2894         my $addr = do { no overloading; pack 'J', $self; };
2895
2896         my $return = $indent
2897                     . sprintf("%04X", $start{$addr})
2898                     . '..'
2899                     . sprintf("%04X", $end{$addr})
2900                     . " '$value{$addr}';";
2901         if (! defined $standard_form{$addr}) {
2902             $return .= "(type=$type{$addr})";
2903         }
2904         elsif ($standard_form{$addr} ne $value{$addr}) {
2905             $return .= "(standard '$standard_form{$addr}')";
2906         }
2907         return $return;
2908     }
2909 } # End closure
2910
2911 package _Range_List_Base;
2912
2913 # Base class for range lists.  A range list is simply an ordered list of
2914 # ranges, so that the ranges with the lowest starting numbers are first in it.
2915 #
2916 # When a new range is added that is adjacent to an existing range that has the
2917 # same value and type, it merges with it to form a larger range.
2918 #
2919 # Ranges generally do not overlap, except that there can be multiple entries
2920 # of single code point ranges.  This is because of NameAliases.txt.
2921 #
2922 # In this program, there is a standard value such that if two different
2923 # values, have the same standard value, they are considered equivalent.  This
2924 # value was chosen so that it gives correct results on Unicode data
2925
2926 # There are a number of methods to manipulate range lists, and some operators
2927 # are overloaded to handle them.
2928
2929 sub trace { return main::trace(@_); }
2930
2931 { # Closure
2932
2933     our $addr;
2934
2935     main::setup_package();
2936
2937     my %ranges;
2938     # The list of ranges
2939     main::set_access('ranges', \%ranges, 'readable_array');
2940
2941     my %max;
2942     # The highest code point in the list.  This was originally a method, but
2943     # actual measurements said it was used a lot.
2944     main::set_access('max', \%max, 'r');
2945
2946     my %each_range_iterator;
2947     # Iterator position for each_range()
2948     main::set_access('each_range_iterator', \%each_range_iterator);
2949
2950     my %owner_name_of;
2951     # Name of parent this is attached to, if any.  Solely for better error
2952     # messages.
2953     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2954
2955     my %_search_ranges_cache;
2956     # A cache of the previous result from _search_ranges(), for better
2957     # performance
2958     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2959
2960     sub new {
2961         my $class = shift;
2962         my %args = @_;
2963
2964         # Optional initialization data for the range list.
2965         my $initialize = delete $args{'Initialize'};
2966
2967         my $self;
2968
2969         # Use _union() to initialize.  _union() returns an object of this
2970         # class, which means that it will call this constructor recursively.
2971         # But it won't have this $initialize parameter so that it won't
2972         # infinitely loop on this.
2973         return _union($class, $initialize, %args) if defined $initialize;
2974
2975         $self = bless \do { my $anonymous_scalar }, $class;
2976         my $addr = do { no overloading; pack 'J', $self; };
2977
2978         # Optional parent object, only for debug info.
2979         $owner_name_of{$addr} = delete $args{'Owner'};
2980         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2981
2982         # Stringify, in case it is an object.
2983         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2984
2985         # This is used only for error messages, and so a colon is added
2986         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2987
2988         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2989
2990         # Max is initialized to a negative value that isn't adjacent to 0,
2991         # for simpler tests
2992         $max{$addr} = -2;
2993
2994         $_search_ranges_cache{$addr} = 0;
2995         $ranges{$addr} = [];
2996
2997         return $self;
2998     }
2999
3000     use overload
3001         fallback => 0,
3002         qw("") => "_operator_stringify",
3003         "." => \&main::_operator_dot,
3004     ;
3005
3006     sub _operator_stringify {
3007         my $self = shift;
3008         my $addr = do { no overloading; pack 'J', $self; };
3009
3010         return "Range_List attached to '$owner_name_of{$addr}'"
3011                                                 if $owner_name_of{$addr};
3012         return "anonymous Range_List " . \$self;
3013     }
3014
3015     sub _union {
3016         # Returns the union of the input code points.  It can be called as
3017         # either a constructor or a method.  If called as a method, the result
3018         # will be a new() instance of the calling object, containing the union
3019         # of that object with the other parameter's code points;  if called as
3020         # a constructor, the first parameter gives the class that the new object
3021         # should be, and the second parameter gives the code points to go into
3022         # it.
3023         # In either case, there are two parameters looked at by this routine;
3024         # any additional parameters are passed to the new() constructor.
3025         #
3026         # The code points can come in the form of some object that contains
3027         # ranges, and has a conventionally named method to access them; or
3028         # they can be an array of individual code points (as integers); or
3029         # just a single code point.
3030         #
3031         # If they are ranges, this routine doesn't make any effort to preserve
3032         # the range values and types of one input over the other.  Therefore
3033         # this base class should not allow _union to be called from other than
3034         # initialization code, so as to prevent two tables from being added
3035         # together where the range values matter.  The general form of this
3036         # routine therefore belongs in a derived class, but it was moved here
3037         # to avoid duplication of code.  The failure to overload this in this
3038         # class keeps it safe.
3039         #
3040         # It does make the effort during initialization to accept tables with
3041         # multiple values for the same code point, and to preserve the order
3042         # of these.  If there is only one input range or range set, it doesn't
3043         # sort (as it should already be sorted to the desired order), and will
3044         # accept multiple values per code point.  Otherwise it will merge
3045         # multiple values into a single one.
3046
3047         my $self;
3048         my @args;   # Arguments to pass to the constructor
3049
3050         my $class = shift;
3051
3052         # If a method call, will start the union with the object itself, and
3053         # the class of the new object will be the same as self.
3054         if (ref $class) {
3055             $self = $class;
3056             $class = ref $self;
3057             push @args, $self;
3058         }
3059
3060         # Add the other required parameter.
3061         push @args, shift;
3062         # Rest of parameters are passed on to the constructor
3063
3064         # Accumulate all records from both lists.
3065         my @records;
3066         my $input_count = 0;
3067         for my $arg (@args) {
3068             #local $to_trace = 0 if main::DEBUG;
3069             trace "argument = $arg" if main::DEBUG && $to_trace;
3070             if (! defined $arg) {
3071                 my $message = "";
3072                 if (defined $self) {
3073                     no overloading;
3074                     $message .= $owner_name_of{pack 'J', $self};
3075                 }
3076                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
3077                 return;
3078             }
3079
3080             $arg = [ $arg ] if ! ref $arg;
3081             my $type = ref $arg;
3082             if ($type eq 'ARRAY') {
3083                 foreach my $element (@$arg) {
3084                     push @records, Range->new($element, $element);
3085                     $input_count++;
3086                 }
3087             }
3088             elsif ($arg->isa('Range')) {
3089                 push @records, $arg;
3090                 $input_count++;
3091             }
3092             elsif ($arg->can('ranges')) {
3093                 push @records, $arg->ranges;
3094                 $input_count++;
3095             }
3096             else {
3097                 my $message = "";
3098                 if (defined $self) {
3099                     no overloading;
3100                     $message .= $owner_name_of{pack 'J', $self};
3101                 }
3102                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3103                 return;
3104             }
3105         }
3106
3107         # Sort with the range containing the lowest ordinal first, but if
3108         # two ranges start at the same code point, sort with the bigger range
3109         # of the two first, because it takes fewer cycles.
3110         if ($input_count > 1) {
3111             @records = sort { ($a->start <=> $b->start)
3112                                       or
3113                                     # if b is shorter than a, b->end will be
3114                                     # less than a->end, and we want to select
3115                                     # a, so want to return -1
3116                                     ($b->end <=> $a->end)
3117                                    } @records;
3118         }
3119
3120         my $new = $class->new(@_);
3121
3122         # Fold in records so long as they add new information.
3123         for my $set (@records) {
3124             my $start = $set->start;
3125             my $end   = $set->end;
3126             my $value = $set->value;
3127             my $type  = $set->type;
3128             if ($start > $new->max) {
3129                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3130             }
3131             elsif ($end > $new->max) {
3132                 $new->_add_delete('+', $new->max +1, $end, $value,
3133                                                                 Type => $type);
3134             }
3135             elsif ($input_count == 1) {
3136                 # Here, overlaps existing range, but is from a single input,
3137                 # so preserve the multiple values from that input.
3138                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3139                                                 Replace => $MULTIPLE_AFTER);
3140             }
3141         }
3142
3143         return $new;
3144     }
3145
3146     sub range_count {        # Return the number of ranges in the range list
3147         my $self = shift;
3148         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3149
3150         no overloading;
3151         return scalar @{$ranges{pack 'J', $self}};
3152     }
3153
3154     sub min {
3155         # Returns the minimum code point currently in the range list, or if
3156         # the range list is empty, 2 beyond the max possible.  This is a
3157         # method because used so rarely, that not worth saving between calls,
3158         # and having to worry about changing it as ranges are added and
3159         # deleted.
3160
3161         my $self = shift;
3162         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3163
3164         my $addr = do { no overloading; pack 'J', $self; };
3165
3166         # If the range list is empty, return a large value that isn't adjacent
3167         # to any that could be in the range list, for simpler tests
3168         return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3169         return $ranges{$addr}->[0]->start;
3170     }
3171
3172     sub contains {
3173         # Boolean: Is argument in the range list?  If so returns $i such that:
3174         #   range[$i]->end < $codepoint <= range[$i+1]->end
3175         # which is one beyond what you want; this is so that the 0th range
3176         # doesn't return false
3177         my $self = shift;
3178         my $codepoint = shift;
3179         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3180
3181         my $i = $self->_search_ranges($codepoint);
3182         return 0 unless defined $i;
3183
3184         # The search returns $i, such that
3185         #   range[$i-1]->end < $codepoint <= range[$i]->end
3186         # So is in the table if and only iff it is at least the start position
3187         # of range $i.
3188         no overloading;
3189         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3190         return $i + 1;
3191     }
3192
3193     sub containing_range {
3194         # Returns the range object that contains the code point, undef if none
3195
3196         my $self = shift;
3197         my $codepoint = shift;
3198         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3199
3200         my $i = $self->contains($codepoint);
3201         return unless $i;
3202
3203         # contains() returns 1 beyond where we should look
3204         no overloading;
3205         return $ranges{pack 'J', $self}->[$i-1];
3206     }
3207
3208     sub value_of {
3209         # Returns the value associated with the code point, undef if none
3210
3211         my $self = shift;
3212         my $codepoint = shift;
3213         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3214
3215         my $range = $self->containing_range($codepoint);
3216         return unless defined $range;
3217
3218         return $range->value;
3219     }
3220
3221     sub type_of {
3222         # Returns the type of the range containing the code point, undef if
3223         # the code point is not in the table
3224
3225         my $self = shift;
3226         my $codepoint = shift;
3227         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3228
3229         my $range = $self->containing_range($codepoint);
3230         return unless defined $range;
3231
3232         return $range->type;
3233     }
3234
3235     sub _search_ranges {
3236         # Find the range in the list which contains a code point, or where it
3237         # should go if were to add it.  That is, it returns $i, such that:
3238         #   range[$i-1]->end < $codepoint <= range[$i]->end
3239         # Returns undef if no such $i is possible (e.g. at end of table), or
3240         # if there is an error.
3241
3242         my $self = shift;
3243         my $code_point = shift;
3244         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3245
3246         my $addr = do { no overloading; pack 'J', $self; };
3247
3248         return if $code_point > $max{$addr};
3249         my $r = $ranges{$addr};                # The current list of ranges
3250         my $range_list_size = scalar @$r;
3251         my $i;
3252
3253         use integer;        # want integer division
3254
3255         # Use the cached result as the starting guess for this one, because,
3256         # an experiment on 5.1 showed that 90% of the time the cache was the
3257         # same as the result on the next call (and 7% it was one less).
3258         $i = $_search_ranges_cache{$addr};
3259         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3260                                             # from an intervening deletion
3261         #local $to_trace = 1 if main::DEBUG;
3262         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3263         return $i if $code_point <= $r->[$i]->end
3264                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3265
3266         # Here the cache doesn't yield the correct $i.  Try adding 1.
3267         if ($i < $range_list_size - 1
3268             && $r->[$i]->end < $code_point &&
3269             $code_point <= $r->[$i+1]->end)
3270         {
3271             $i++;
3272             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3273             $_search_ranges_cache{$addr} = $i;
3274             return $i;
3275         }
3276
3277         # Here, adding 1 also didn't work.  We do a binary search to
3278         # find the correct position, starting with current $i
3279         my $lower = 0;
3280         my $upper = $range_list_size - 1;
3281         while (1) {
3282             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3283
3284             if ($code_point <= $r->[$i]->end) {
3285
3286                 # Here we have met the upper constraint.  We can quit if we
3287                 # also meet the lower one.
3288                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3289
3290                 $upper = $i;        # Still too high.
3291
3292             }
3293             else {
3294
3295                 # Here, $r[$i]->end < $code_point, so look higher up.
3296                 $lower = $i;
3297             }
3298
3299             # Split search domain in half to try again.
3300             my $temp = ($upper + $lower) / 2;
3301
3302             # No point in continuing unless $i changes for next time
3303             # in the loop.
3304             if ($temp == $i) {
3305
3306                 # We can't reach the highest element because of the averaging.
3307                 # So if one below the upper edge, force it there and try one
3308                 # more time.
3309                 if ($i == $range_list_size - 2) {
3310
3311                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3312                     $i = $range_list_size - 1;
3313
3314                     # Change $lower as well so if fails next time through,
3315                     # taking the average will yield the same $i, and we will
3316                     # quit with the error message just below.
3317                     $lower = $i;
3318                     next;
3319                 }
3320                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3321                 return;
3322             }
3323             $i = $temp;
3324         } # End of while loop
3325
3326         if (main::DEBUG && $to_trace) {
3327             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3328             trace "i=  [ $i ]", $r->[$i];
3329             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3330         }
3331
3332         # Here we have found the offset.  Cache it as a starting point for the
3333         # next call.
3334         $_search_ranges_cache{$addr} = $i;
3335         return $i;
3336     }
3337
3338     sub _add_delete {
3339         # Add, replace or delete ranges to or from a list.  The $type
3340         # parameter gives which:
3341         #   '+' => insert or replace a range, returning a list of any changed
3342         #          ranges.
3343         #   '-' => delete a range, returning a list of any deleted ranges.
3344         #
3345         # The next three parameters give respectively the start, end, and
3346         # value associated with the range.  'value' should be null unless the
3347         # operation is '+';
3348         #
3349         # The range list is kept sorted so that the range with the lowest
3350         # starting position is first in the list, and generally, adjacent
3351         # ranges with the same values are merged into a single larger one (see
3352         # exceptions below).
3353         #
3354         # There are more parameters; all are key => value pairs:
3355         #   Type    gives the type of the value.  It is only valid for '+'.
3356         #           All ranges have types; if this parameter is omitted, 0 is
3357         #           assumed.  Ranges with type 0 are assumed to obey the
3358         #           Unicode rules for casing, etc; ranges with other types are
3359         #           not.  Otherwise, the type is arbitrary, for the caller's
3360         #           convenience, and looked at only by this routine to keep
3361         #           adjacent ranges of different types from being merged into
3362         #           a single larger range, and when Replace =>
3363         #           $IF_NOT_EQUIVALENT is specified (see just below).
3364         #   Replace  determines what to do if the range list already contains
3365         #            ranges which coincide with all or portions of the input
3366         #            range.  It is only valid for '+':
3367         #       => $NO            means that the new value is not to replace
3368         #                         any existing ones, but any empty gaps of the
3369         #                         range list coinciding with the input range
3370         #                         will be filled in with the new value.
3371         #       => $UNCONDITIONALLY  means to replace the existing values with
3372         #                         this one unconditionally.  However, if the
3373         #                         new and old values are identical, the
3374         #                         replacement is skipped to save cycles
3375         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3376         #          (the default)  with this one if they are not equivalent.
3377         #                         Ranges are equivalent if their types are the
3378         #                         same, and they are the same string; or if
3379         #                         both are type 0 ranges, if their Unicode
3380         #                         standard forms are identical.  In this last
3381         #                         case, the routine chooses the more "modern"
3382         #                         one to use.  This is because some of the
3383         #                         older files are formatted with values that
3384         #                         are, for example, ALL CAPs, whereas the
3385         #                         derived files have a more modern style,
3386         #                         which looks better.  By looking for this
3387         #                         style when the pre-existing and replacement
3388         #                         standard forms are the same, we can move to
3389         #                         the modern style
3390         #       => $MULTIPLE_BEFORE means that if this range duplicates an
3391         #                         existing one, but has a different value,
3392         #                         don't replace the existing one, but insert
3393         #                         this, one so that the same range can occur
3394         #                         multiple times.  They are stored LIFO, so
3395         #                         that the final one inserted is the first one
3396         #                         returned in an ordered search of the table.
3397         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3398         #                         FIFO, so that this one is inserted after all
3399         #                         others that currently exist.
3400         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3401         #
3402         # "same value" means identical for non-type-0 ranges, and it means
3403         # having the same standard forms for type-0 ranges.
3404
3405         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3406
3407         my $self = shift;
3408         my $operation = shift;   # '+' for add/replace; '-' for delete;
3409         my $start = shift;
3410         my $end   = shift;
3411         my $value = shift;
3412
3413         my %args = @_;
3414
3415         $value = "" if not defined $value;        # warning: $value can be "0"
3416
3417         my $replace = delete $args{'Replace'};
3418         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3419
3420         my $type = delete $args{'Type'};
3421         $type = 0 unless defined $type;
3422
3423         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3424
3425         my $addr = do { no overloading; pack 'J', $self; };
3426
3427         if ($operation ne '+' && $operation ne '-') {
3428             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3429             return;
3430         }
3431         unless (defined $start && defined $end) {
3432             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3433             return;
3434         }
3435         unless ($end >= $start) {
3436             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3437             return;
3438         }
3439         #local $to_trace = 1 if main::DEBUG;
3440
3441         if ($operation eq '-') {
3442             if ($replace != $IF_NOT_EQUIVALENT) {
3443                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3444                 $replace = $IF_NOT_EQUIVALENT;
3445             }
3446             if ($type) {
3447                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3448                 $type = 0;
3449             }
3450             if ($value ne "") {
3451                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3452                 $value = "";
3453             }
3454         }
3455
3456         my $r = $ranges{$addr};               # The current list of ranges
3457         my $range_list_size = scalar @$r;     # And its size
3458         my $max = $max{$addr};                # The current high code point in
3459                                               # the list of ranges
3460
3461         # Do a special case requiring fewer machine cycles when the new range
3462         # starts after the current highest point.  The Unicode input data is
3463         # structured so this is common.
3464         if ($start > $max) {
3465
3466             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3467             return if $operation eq '-'; # Deleting a non-existing range is a
3468                                          # no-op
3469
3470             # If the new range doesn't logically extend the current final one
3471             # in the range list, create a new range at the end of the range
3472             # list.  (max cleverly is initialized to a negative number not
3473             # adjacent to 0 if the range list is empty, so even adding a range
3474             # to an empty range list starting at 0 will have this 'if'
3475             # succeed.)
3476             if ($start > $max + 1        # non-adjacent means can't extend.
3477                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3478                 || @{$r}[-1]->type != $type # types differ, can't extend.
3479             ) {
3480                 push @$r, Range->new($start, $end,
3481                                      Value => $value,
3482                                      Type => $type);
3483             }
3484             else {
3485
3486                 # Here, the new range starts just after the current highest in
3487                 # the range list, and they have the same type and value.
3488                 # Extend the current range to incorporate the new one.
3489                 @{$r}[-1]->set_end($end);
3490             }
3491
3492             # This becomes the new maximum.
3493             $max{$addr} = $end;
3494
3495             return;
3496         }
3497         #local $to_trace = 0 if main::DEBUG;
3498
3499         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3500
3501         # Here, the input range isn't after the whole rest of the range list.
3502         # Most likely 'splice' will be needed.  The rest of the routine finds
3503         # the needed splice parameters, and if necessary, does the splice.
3504         # First, find the offset parameter needed by the splice function for
3505         # the input range.  Note that the input range may span multiple
3506         # existing ones, but we'll worry about that later.  For now, just find
3507         # the beginning.  If the input range is to be inserted starting in a
3508         # position not currently in the range list, it must (obviously) come
3509         # just after the range below it, and just before the range above it.
3510         # Slightly less obviously, it will occupy the position currently
3511         # occupied by the range that is to come after it.  More formally, we
3512         # are looking for the position, $i, in the array of ranges, such that:
3513         #
3514         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3515         #
3516         # (The ordered relationships within existing ranges are also shown in
3517         # the equation above).  However, if the start of the input range is
3518         # within an existing range, the splice offset should point to that
3519         # existing range's position in the list; that is $i satisfies a
3520         # somewhat different equation, namely:
3521         #
3522         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3523         #
3524         # More briefly, $start can come before or after r[$i]->start, and at
3525         # this point, we don't know which it will be.  However, these
3526         # two equations share these constraints:
3527         #
3528         #   r[$i-1]->end < $start <= r[$i]->end
3529         #
3530         # And that is good enough to find $i.
3531
3532         my $i = $self->_search_ranges($start);
3533         if (! defined $i) {
3534             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3535             return;
3536         }
3537
3538         # The search function returns $i such that:
3539         #
3540         # r[$i-1]->end < $start <= r[$i]->end
3541         #
3542         # That means that $i points to the first range in the range list
3543         # that could possibly be affected by this operation.  We still don't
3544         # know if the start of the input range is within r[$i], or if it
3545         # points to empty space between r[$i-1] and r[$i].
3546         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3547
3548         # Special case the insertion of data that is not to replace any
3549         # existing data.
3550         if ($replace == $NO) {  # If $NO, has to be operation '+'
3551             #local $to_trace = 1 if main::DEBUG;
3552             trace "Doesn't replace" if main::DEBUG && $to_trace;
3553
3554             # Here, the new range is to take effect only on those code points
3555             # that aren't already in an existing range.  This can be done by
3556             # looking through the existing range list and finding the gaps in
3557             # the ranges that this new range affects, and then calling this
3558             # function recursively on each of those gaps, leaving untouched
3559             # anything already in the list.  Gather up a list of the changed
3560             # gaps first so that changes to the internal state as new ranges
3561             # are added won't be a problem.
3562             my @gap_list;
3563
3564             # First, if the starting point of the input range is outside an
3565             # existing one, there is a gap from there to the beginning of the
3566             # existing range -- add a span to fill the part that this new
3567             # range occupies
3568             if ($start < $r->[$i]->start) {
3569                 push @gap_list, Range->new($start,
3570                                            main::min($end,
3571                                                      $r->[$i]->start - 1),
3572                                            Type => $type);
3573                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3574             }
3575
3576             # Then look through the range list for other gaps until we reach
3577             # the highest range affected by the input one.
3578             my $j;
3579             for ($j = $i+1; $j < $range_list_size; $j++) {
3580                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3581                 last if $end < $r->[$j]->start;
3582
3583                 # If there is a gap between when this range starts and the
3584                 # previous one ends, add a span to fill it.  Note that just
3585                 # because there are two ranges doesn't mean there is a
3586                 # non-zero gap between them.  It could be that they have
3587                 # different values or types
3588                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3589                     push @gap_list,
3590                         Range->new($r->[$j-1]->end + 1,
3591                                    $r->[$j]->start - 1,
3592                                    Type => $type);
3593                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3594                 }
3595             }
3596
3597             # Here, we have either found an existing range in the range list,
3598             # beyond the area affected by the input one, or we fell off the
3599             # end of the loop because the input range affects the whole rest
3600             # of the range list.  In either case, $j is 1 higher than the
3601             # highest affected range.  If $j == $i, it means that there are no
3602             # affected ranges, that the entire insertion is in the gap between
3603             # r[$i-1], and r[$i], which we already have taken care of before
3604             # the loop.
3605             # On the other hand, if there are affected ranges, it might be
3606             # that there is a gap that needs filling after the final such
3607             # range to the end of the input range
3608             if ($r->[$j-1]->end < $end) {
3609                     push @gap_list, Range->new(main::max($start,
3610                                                          $r->[$j-1]->end + 1),
3611                                                $end,
3612                                                Type => $type);
3613                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3614             }
3615
3616             # Call recursively to fill in all the gaps.
3617             foreach my $gap (@gap_list) {
3618                 $self->_add_delete($operation,
3619                                    $gap->start,
3620                                    $gap->end,
3621                                    $value,
3622                                    Type => $type);
3623             }
3624
3625             return;
3626         }
3627
3628         # Here, we have taken care of the case where $replace is $NO.
3629         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3630         # If inserting a multiple record, this is where it goes, before the
3631         # first (if any) existing one if inserting LIFO.  (If this is to go
3632         # afterwards, FIFO, we below move the pointer to there.)  These imply
3633         # an insertion, and no change to any existing ranges.  Note that $i
3634         # can be -1 if this new range doesn't actually duplicate any existing,
3635         # and comes at the beginning of the list.
3636         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3637
3638             if ($start != $end) {
3639                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3640                 return;
3641             }
3642
3643             # If the new code point is within a current range ...
3644             if ($end >= $r->[$i]->start) {
3645
3646                 # Don't add an exact duplicate, as it isn't really a multiple
3647                 my $existing_value = $r->[$i]->value;
3648                 my $existing_type = $r->[$i]->type;
3649                 return if $value eq $existing_value && $type eq $existing_type;
3650
3651                 # If the multiple value is part of an existing range, we want
3652                 # to split up that range, so that only the single code point
3653                 # is affected.  To do this, we first call ourselves
3654                 # recursively to delete that code point from the table, having
3655                 # preserved its current data above.  Then we call ourselves
3656                 # recursively again to add the new multiple, which we know by
3657                 # the test just above is different than the current code
3658                 # point's value, so it will become a range containing a single
3659                 # code point: just itself.  Finally, we add back in the
3660                 # pre-existing code point, which will again be a single code
3661                 # point range.  Because 'i' likely will have changed as a
3662                 # result of these operations, we can't just continue on, but
3663                 # do this operation recursively as well.  If we are inserting
3664                 # LIFO, the pre-existing code point needs to go after the new
3665                 # one, so use MULTIPLE_AFTER; and vice versa.
3666                 if ($r->[$i]->start != $r->[$i]->end) {
3667                     $self->_add_delete('-', $start, $end, "");
3668                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3669                     return $self->_add_delete('+',
3670                             $start, $end,
3671                             $existing_value,
3672                             Type => $existing_type,
3673                             Replace => ($replace == $MULTIPLE_BEFORE)
3674                                        ? $MULTIPLE_AFTER
3675                                        : $MULTIPLE_BEFORE);
3676                 }
3677             }
3678
3679             # If to place this new record after, move to beyond all existing
3680             # ones; but don't add this one if identical to any of them, as it
3681             # isn't really a multiple
3682             if ($replace == $MULTIPLE_AFTER) {
3683                 while ($i < @$r && $r->[$i]->start == $start) {
3684                     return if $value eq $r->[$i]->value
3685                               && $type eq $r->[$i]->type;
3686                     $i++;
3687                 }
3688             }
3689
3690             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3691             my @return = splice @$r,
3692                                 $i,
3693                                 0,
3694                                 Range->new($start,
3695                                            $end,
3696                                            Value => $value,
3697                                            Type => $type);
3698             if (main::DEBUG && $to_trace) {
3699                 trace "After splice:";
3700                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3701                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3702                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
3703                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3704                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3705                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3706             }
3707             return @return;
3708         }
3709
3710         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
3711         # leaves delete, insert, and replace either unconditionally or if not
3712         # equivalent.  $i still points to the first potential affected range.
3713         # Now find the highest range affected, which will determine the length
3714         # parameter to splice.  (The input range can span multiple existing
3715         # ones.)  If this isn't a deletion, while we are looking through the
3716         # range list, see also if this is a replacement rather than a clean
3717         # insertion; that is if it will change the values of at least one
3718         # existing range.  Start off assuming it is an insert, until find it
3719         # isn't.
3720         my $clean_insert = $operation eq '+';
3721         my $j;        # This will point to the highest affected range
3722
3723         # For non-zero types, the standard form is the value itself;
3724         my $standard_form = ($type) ? $value : main::standardize($value);
3725
3726         for ($j = $i; $j < $range_list_size; $j++) {
3727             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3728
3729             # If find a range that it doesn't overlap into, we can stop
3730             # searching
3731             last if $end < $r->[$j]->start;
3732
3733             # Here, overlaps the range at $j.  If the values don't match,
3734             # and so far we think this is a clean insertion, it becomes a
3735             # non-clean insertion, i.e., a 'change' or 'replace' instead.
3736             if ($clean_insert) {
3737                 if ($r->[$j]->standard_form ne $standard_form) {
3738                     $clean_insert = 0;
3739                     if ($replace == $CROAK) {
3740                         main::croak("The range to add "
3741                         . sprintf("%04X", $start)
3742                         . '-'
3743                         . sprintf("%04X", $end)
3744                         . " with value '$value' overlaps an existing range $r->[$j]");
3745                     }
3746                 }
3747                 else {
3748
3749                     # Here, the two values are essentially the same.  If the
3750                     # two are actually identical, replacing wouldn't change
3751                     # anything so skip it.
3752                     my $pre_existing = $r->[$j]->value;
3753                     if ($pre_existing ne $value) {
3754
3755                         # Here the new and old standardized values are the
3756                         # same, but the non-standardized values aren't.  If
3757                         # replacing unconditionally, then replace
3758                         if( $replace == $UNCONDITIONALLY) {
3759                             $clean_insert = 0;
3760                         }
3761                         else {
3762
3763                             # Here, are replacing conditionally.  Decide to
3764                             # replace or not based on which appears to look
3765                             # the "nicest".  If one is mixed case and the
3766                             # other isn't, choose the mixed case one.
3767                             my $new_mixed = $value =~ /[A-Z]/
3768                                             && $value =~ /[a-z]/;
3769                             my $old_mixed = $pre_existing =~ /[A-Z]/
3770                                             && $pre_existing =~ /[a-z]/;
3771
3772                             if ($old_mixed != $new_mixed) {
3773                                 $clean_insert = 0 if $new_mixed;
3774                                 if (main::DEBUG && $to_trace) {
3775                                     if ($clean_insert) {
3776                                         trace "Retaining $pre_existing over $value";
3777                                     }
3778                                     else {
3779                                         trace "Replacing $pre_existing with $value";
3780                                     }
3781                                 }
3782                             }
3783                             else {
3784
3785                                 # Here casing wasn't different between the two.
3786                                 # If one has hyphens or underscores and the
3787                                 # other doesn't, choose the one with the
3788                                 # punctuation.
3789                                 my $new_punct = $value =~ /[-_]/;
3790                                 my $old_punct = $pre_existing =~ /[-_]/;
3791
3792                                 if ($old_punct != $new_punct) {
3793                                     $clean_insert = 0 if $new_punct;
3794                                     if (main::DEBUG && $to_trace) {
3795                                         if ($clean_insert) {
3796                                             trace "Retaining $pre_existing over $value";
3797                                         }
3798                                         else {
3799                                             trace "Replacing $pre_existing with $value";
3800                                         }
3801                                     }
3802                                 }   # else existing one is just as "good";
3803                                     # retain it to save cycles.
3804                             }
3805                         }
3806                     }
3807                 }
3808             }
3809         } # End of loop looking for highest affected range.
3810
3811         # Here, $j points to one beyond the highest range that this insertion
3812         # affects (hence to beyond the range list if that range is the final
3813         # one in the range list).
3814
3815         # The splice length is all the affected ranges.  Get it before
3816         # subtracting, for efficiency, so we don't have to later add 1.
3817         my $length = $j - $i;
3818
3819         $j--;        # $j now points to the highest affected range.
3820         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3821
3822         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3823         # $j points to the highest affected range.  But it can be < $i or even
3824         # -1.  These happen only if the insertion is entirely in the gap
3825         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3826         # above exited first time through with $end < $r->[$i]->start.  (And
3827         # then we subtracted one from j)  This implies also that $start <
3828         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3829         # $start, so the entire input range is in the gap.
3830         if ($j < $i) {
3831
3832             # Here the entire input range is in the gap before $i.
3833
3834             if (main::DEBUG && $to_trace) {
3835                 if ($i) {
3836                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3837                 }
3838                 else {
3839                     trace "Entire range is before $r->[$i]";
3840                 }
3841             }
3842             return if $operation ne '+'; # Deletion of a non-existent range is
3843                                          # a no-op
3844         }
3845         else {
3846
3847             # Here part of the input range is not in the gap before $i.  Thus,
3848             # there is at least one affected one, and $j points to the highest
3849             # such one.
3850
3851             # At this point, here is the situation:
3852             # This is not an insertion of a multiple, nor of tentative ($NO)
3853             # data.
3854             #   $i  points to the first element in the current range list that
3855             #            may be affected by this operation.  In fact, we know
3856             #            that the range at $i is affected because we are in
3857             #            the else branch of this 'if'
3858             #   $j  points to the highest affected range.
3859             # In other words,
3860             #   r[$i-1]->end < $start <= r[$i]->end
3861             # And:
3862             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3863             #
3864             # Also:
3865             #   $clean_insert is a boolean which is set true if and only if
3866             #        this is a "clean insertion", i.e., not a change nor a
3867             #        deletion (multiple was handled above).
3868
3869             # We now have enough information to decide if this call is a no-op
3870             # or not.  It is a no-op if this is an insertion of already
3871             # existing data.
3872
3873             if (main::DEBUG && $to_trace && $clean_insert
3874                                          && $i == $j
3875                                          && $start >= $r->[$i]->start)
3876             {
3877                     trace "no-op";
3878             }
3879             return if $clean_insert
3880                       && $i == $j # more than one affected range => not no-op
3881
3882                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3883                       # Further, $start and/or $end is >= r[$i]->start
3884                       # The test below hence guarantees that
3885                       #     r[$i]->start < $start <= $end <= r[$i]->end
3886                       # This means the input range is contained entirely in
3887                       # the one at $i, so is a no-op
3888                       && $start >= $r->[$i]->start;
3889         }
3890
3891         # Here, we know that some action will have to be taken.  We have
3892         # calculated the offset and length (though adjustments may be needed)
3893         # for the splice.  Now start constructing the replacement list.
3894         my @replacement;
3895         my $splice_start = $i;
3896
3897         my $extends_below;
3898         my $extends_above;
3899
3900         # See if should extend any adjacent ranges.
3901         if ($operation eq '-') { # Don't extend deletions
3902             $extends_below = $extends_above = 0;
3903         }
3904         else {  # Here, should extend any adjacent ranges.  See if there are
3905                 # any.
3906             $extends_below = ($i > 0
3907                             # can't extend unless adjacent
3908                             && $r->[$i-1]->end == $start -1
3909                             # can't extend unless are same standard value
3910                             && $r->[$i-1]->standard_form eq $standard_form
3911                             # can't extend unless share type
3912                             && $r->[$i-1]->type == $type);
3913             $extends_above = ($j+1 < $range_list_size
3914                             && $r->[$j+1]->start == $end +1
3915                             && $r->[$j+1]->standard_form eq $standard_form
3916                             && $r->[$j+1]->type == $type);
3917         }
3918         if ($extends_below && $extends_above) { # Adds to both
3919             $splice_start--;     # start replace at element below
3920             $length += 2;        # will replace on both sides
3921             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3922
3923             # The result will fill in any gap, replacing both sides, and
3924             # create one large range.
3925             @replacement = Range->new($r->[$i-1]->start,
3926                                       $r->[$j+1]->end,
3927                                       Value => $value,
3928                                       Type => $type);
3929         }
3930         else {
3931
3932             # Here we know that the result won't just be the conglomeration of
3933             # a new range with both its adjacent neighbors.  But it could
3934             # extend one of them.
3935
3936             if ($extends_below) {
3937
3938                 # Here the new element adds to the one below, but not to the
3939                 # one above.  If inserting, and only to that one range,  can
3940                 # just change its ending to include the new one.
3941                 if ($length == 0 && $clean_insert) {
3942                     $r->[$i-1]->set_end($end);
3943                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3944                     return;
3945                 }
3946                 else {
3947                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3948                     $splice_start--;        # start replace at element below
3949                     $length++;              # will replace the element below
3950                     $start = $r->[$i-1]->start;
3951                 }
3952             }
3953             elsif ($extends_above) {
3954
3955                 # Here the new element adds to the one above, but not below.
3956                 # Mirror the code above
3957                 if ($length == 0 && $clean_insert) {
3958                     $r->[$j+1]->set_start($start);
3959                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3960                     return;
3961                 }
3962                 else {
3963                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3964                     $length++;        # will replace the element above
3965                     $end = $r->[$j+1]->end;
3966                 }
3967             }
3968
3969             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3970
3971             # Finally, here we know there will have to be a splice.
3972             # If the change or delete affects only the highest portion of the
3973             # first affected range, the range will have to be split.  The
3974             # splice will remove the whole range, but will replace it by a new
3975             # range containing just the unaffected part.  So, in this case,
3976             # add to the replacement list just this unaffected portion.
3977             if (! $extends_below
3978                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3979             {
3980                 push @replacement,
3981                     Range->new($r->[$i]->start,
3982                                $start - 1,
3983                                Value => $r->[$i]->value,
3984                                Type => $r->[$i]->type);
3985             }
3986
3987             # In the case of an insert or change, but not a delete, we have to
3988             # put in the new stuff;  this comes next.
3989             if ($operation eq '+') {
3990                 push @replacement, Range->new($start,
3991                                               $end,
3992                                               Value => $value,
3993                                               Type => $type);
3994             }
3995
3996             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3997             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3998
3999             # And finally, if we're changing or deleting only a portion of the
4000             # highest affected range, it must be split, as the lowest one was.
4001             if (! $extends_above
4002                 && $j >= 0  # Remember that j can be -1 if before first
4003                             # current element
4004                 && $end >= $r->[$j]->start
4005                 && $end < $r->[$j]->end)
4006             {
4007                 push @replacement,
4008                     Range->new($end + 1,
4009                                $r->[$j]->end,
4010                                Value => $r->[$j]->value,
4011                                Type => $r->[$j]->type);
4012             }
4013         }
4014
4015         # And do the splice, as calculated above
4016         if (main::DEBUG && $to_trace) {
4017             trace "replacing $length element(s) at $i with ";
4018             foreach my $replacement (@replacement) {
4019                 trace "    $replacement";
4020             }
4021             trace "Before splice:";
4022             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4023             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4024             trace "i  =[", $i, "]", $r->[$i];
4025             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4026             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4027         }
4028
4029         my @return = splice @$r, $splice_start, $length, @replacement;
4030
4031         if (main::DEBUG && $to_trace) {
4032             trace "After splice:";
4033             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4034             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4035             trace "i  =[", $i, "]", $r->[$i];
4036             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4037             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4038             trace "removed ", @return if @return;
4039         }
4040
4041         # An actual deletion could have changed the maximum in the list.
4042         # There was no deletion if the splice didn't return something, but
4043         # otherwise recalculate it.  This is done too rarely to worry about
4044         # performance.
4045         if ($operation eq '-' && @return) {
4046             $max{$addr} = $r->[-1]->end;
4047         }
4048         return @return;
4049     }
4050
4051     sub reset_each_range {  # reset the iterator for each_range();
4052         my $self = shift;
4053         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4054
4055         no overloading;
4056         undef $each_range_iterator{pack 'J', $self};
4057         return;
4058     }
4059
4060     sub each_range {
4061         # Iterate over each range in a range list.  Results are undefined if
4062         # the range list is changed during the iteration.
4063
4064         my $self = shift;
4065         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4066
4067         my $addr = do { no overloading; pack 'J', $self; };
4068
4069         return if $self->is_empty;
4070
4071         $each_range_iterator{$addr} = -1
4072                                 if ! defined $each_range_iterator{$addr};
4073         $each_range_iterator{$addr}++;
4074         return $ranges{$addr}->[$each_range_iterator{$addr}]
4075                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4076         undef $each_range_iterator{$addr};
4077         return;
4078     }
4079
4080     sub count {        # Returns count of code points in range list
4081         my $self = shift;
4082         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4083
4084         my $addr = do { no overloading; pack 'J', $self; };
4085
4086         my $count = 0;
4087         foreach my $range (@{$ranges{$addr}}) {
4088             $count += $range->end - $range->start + 1;
4089         }
4090         return $count;
4091     }
4092
4093     sub delete_range {    # Delete a range
4094         my $self = shift;
4095         my $start = shift;
4096         my $end = shift;
4097
4098         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4099
4100         return $self->_add_delete('-', $start, $end, "");
4101     }
4102
4103     sub is_empty { # Returns boolean as to if a range list is empty
4104         my $self = shift;
4105         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4106
4107         no overloading;
4108         return scalar @{$ranges{pack 'J', $self}} == 0;
4109     }
4110
4111     sub hash {
4112         # Quickly returns a scalar suitable for separating tables into
4113         # buckets, i.e. it is a hash function of the contents of a table, so
4114         # there are relatively few conflicts.
4115
4116         my $self = shift;
4117         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4118
4119         my $addr = do { no overloading; pack 'J', $self; };
4120
4121         # These are quickly computable.  Return looks like 'min..max;count'
4122         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4123     }
4124 } # End closure for _Range_List_Base
4125
4126 package Range_List;
4127 use base '_Range_List_Base';
4128
4129 # A Range_List is a range list for match tables; i.e. the range values are
4130 # not significant.  Thus a number of operations can be safely added to it,
4131 # such as inversion, intersection.  Note that union is also an unsafe
4132 # operation when range values are cared about, and that method is in the base
4133 # class, not here.  But things are set up so that that method is callable only
4134 # during initialization.  Only in this derived class, is there an operation
4135 # that combines two tables.  A Range_Map can thus be used to initialize a
4136 # Range_List, and its mappings will be in the list, but are not significant to
4137 # this class.
4138
4139 sub trace { return main::trace(@_); }
4140
4141 { # Closure
4142
4143     use overload
4144         fallback => 0,
4145         '+' => sub { my $self = shift;
4146                     my $other = shift;
4147
4148                     return $self->_union($other)
4149                 },
4150         '&' => sub { my $self = shift;
4151                     my $other = shift;
4152
4153                     return $self->_intersect($other, 0);
4154                 },
4155         '~' => "_invert",
4156         '-' => "_subtract",
4157     ;
4158
4159     sub _invert {
4160         # Returns a new Range_List that gives all code points not in $self.
4161
4162         my $self = shift;
4163
4164         my $new = Range_List->new;
4165
4166         # Go through each range in the table, finding the gaps between them
4167         my $max = -1;   # Set so no gap before range beginning at 0
4168         for my $range ($self->ranges) {
4169             my $start = $range->start;
4170             my $end   = $range->end;
4171
4172             # If there is a gap before this range, the inverse will contain
4173             # that gap.
4174             if ($start > $max + 1) {
4175                 $new->add_range($max + 1, $start - 1);
4176             }
4177             $max = $end;
4178         }
4179
4180         # And finally, add the gap from the end of the table to the max
4181         # possible code point
4182         if ($max < $MAX_UNICODE_CODEPOINT) {
4183             $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4184         }
4185         return $new;
4186     }
4187
4188     sub _subtract {
4189         # Returns a new Range_List with the argument deleted from it.  The
4190         # argument can be a single code point, a range, or something that has
4191         # a range, with the _range_list() method on it returning them
4192
4193         my $self = shift;
4194         my $other = shift;
4195         my $reversed = shift;
4196         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4197
4198         if ($reversed) {
4199             Carp::my_carp_bug("Can't cope with a "
4200              .  __PACKAGE__
4201              . " being the second parameter in a '-'.  Subtraction ignored.");
4202             return $self;
4203         }
4204
4205         my $new = Range_List->new(Initialize => $self);
4206
4207         if (! ref $other) { # Single code point
4208             $new->delete_range($other, $other);
4209         }
4210         elsif ($other->isa('Range')) {
4211             $new->delete_range($other->start, $other->end);
4212         }
4213         elsif ($other->can('_range_list')) {
4214             foreach my $range ($other->_range_list->ranges) {
4215                 $new->delete_range($range->start, $range->end);
4216             }
4217         }
4218         else {
4219             Carp::my_carp_bug("Can't cope with a "
4220                         . ref($other)
4221                         . " argument to '-'.  Subtraction ignored."
4222                         );
4223             return $self;
4224         }
4225
4226         return $new;
4227     }
4228
4229     sub _intersect {
4230         # Returns either a boolean giving whether the two inputs' range lists
4231         # intersect (overlap), or a new Range_List containing the intersection
4232         # of the two lists.  The optional final parameter being true indicates
4233         # to do the check instead of the intersection.
4234
4235         my $a_object = shift;
4236         my $b_object = shift;
4237         my $check_if_overlapping = shift;
4238         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4239         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4240
4241         if (! defined $b_object) {
4242             my $message = "";
4243             $message .= $a_object->_owner_name_of if defined $a_object;
4244             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4245             return;
4246         }
4247
4248         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4249         # Thus the intersection could be much more simply be written:
4250         #   return ~(~$a_object + ~$b_object);
4251         # But, this is slower, and when taking the inverse of a large
4252         # range_size_1 table, back when such tables were always stored that
4253         # way, it became prohibitively slow, hence the code was changed to the
4254         # below
4255
4256         if ($b_object->isa('Range')) {
4257             $b_object = Range_List->new(Initialize => $b_object,
4258                                         Owner => $a_object->_owner_name_of);
4259         }
4260         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4261
4262         my @a_ranges = $a_object->ranges;
4263         my @b_ranges = $b_object->ranges;
4264
4265         #local $to_trace = 1 if main::DEBUG;
4266         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4267
4268         # Start with the first range in each list
4269         my $a_i = 0;
4270         my $range_a = $a_ranges[$a_i];
4271         my $b_i = 0;
4272         my $range_b = $b_ranges[$b_i];
4273
4274         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4275                                                 if ! $check_if_overlapping;
4276
4277         # If either list is empty, there is no intersection and no overlap
4278         if (! defined $range_a || ! defined $range_b) {
4279             return $check_if_overlapping ? 0 : $new;
4280         }
4281         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4282
4283         # Otherwise, must calculate the intersection/overlap.  Start with the
4284         # very first code point in each list
4285         my $a = $range_a->start;
4286         my $b = $range_b->start;
4287
4288         # Loop through all the ranges of each list; in each iteration, $a and
4289         # $b are the current code points in their respective lists
4290         while (1) {
4291
4292             # If $a and $b are the same code point, ...
4293             if ($a == $b) {
4294
4295                 # it means the lists overlap.  If just checking for overlap
4296                 # know the answer now,
4297                 return 1 if $check_if_overlapping;
4298
4299                 # The intersection includes this code point plus anything else
4300                 # common to both current ranges.
4301                 my $start = $a;
4302                 my $end = main::min($range_a->end, $range_b->end);
4303                 if (! $check_if_overlapping) {
4304                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4305                     $new->add_range($start, $end);
4306                 }
4307
4308                 # Skip ahead to the end of the current intersect
4309                 $a = $b = $end;
4310
4311                 # If the current intersect ends at the end of either range (as
4312                 # it must for at least one of them), the next possible one
4313                 # will be the beginning code point in it's list's next range.
4314                 if ($a == $range_a->end) {
4315                     $range_a = $a_ranges[++$a_i];
4316                     last unless defined $range_a;
4317                     $a = $range_a->start;
4318                 }
4319                 if ($b == $range_b->end) {
4320                     $range_b = $b_ranges[++$b_i];
4321                     last unless defined $range_b;
4322                     $b = $range_b->start;
4323                 }
4324
4325                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4326             }
4327             elsif ($a < $b) {
4328
4329                 # Not equal, but if the range containing $a encompasses $b,
4330                 # change $a to be the middle of the range where it does equal
4331                 # $b, so the next iteration will get the intersection
4332                 if ($range_a->end >= $b) {
4333                     $a = $b;
4334                 }
4335                 else {
4336
4337                     # Here, the current range containing $a is entirely below
4338                     # $b.  Go try to find a range that could contain $b.
4339                     $a_i = $a_object->_search_ranges($b);
4340
4341                     # If no range found, quit.
4342                     last unless defined $a_i;
4343
4344                     # The search returns $a_i, such that
4345                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4346                     # Set $a to the beginning of this new range, and repeat.
4347                     $range_a = $a_ranges[$a_i];
4348                     $a = $range_a->start;
4349                 }
4350             }
4351             else { # Here, $b < $a.
4352
4353                 # Mirror image code to the leg just above
4354                 if ($range_b->end >= $a) {
4355                     $b = $a;
4356                 }
4357                 else {
4358                     $b_i = $b_object->_search_ranges($a);
4359                     last unless defined $b_i;
4360                     $range_b = $b_ranges[$b_i];
4361                     $b = $range_b->start;
4362                 }
4363             }
4364         } # End of looping through ranges.
4365
4366         # Intersection fully computed, or now know that there is no overlap
4367         return $check_if_overlapping ? 0 : $new;
4368     }
4369
4370     sub overlaps {
4371         # Returns boolean giving whether the two arguments overlap somewhere
4372
4373         my $self = shift;
4374         my $other = shift;
4375         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4376
4377         return $self->_intersect($other, 1);
4378     }
4379
4380     sub add_range {
4381         # Add a range to the list.
4382
4383         my $self = shift;
4384         my $start = shift;
4385         my $end = shift;
4386         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4387
4388         return $self->_add_delete('+', $start, $end, "");
4389     }
4390
4391     sub matches_identically_to {
4392         # Return a boolean as to whether or not two Range_Lists match identical
4393         # sets of code points.
4394
4395         my $self = shift;
4396         my $other = shift;
4397         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4398
4399         # These are ordered in increasing real time to figure out (at least
4400         # until a patch changes that and doesn't change this)
4401         return 0 if $self->max != $other->max;
4402         return 0 if $self->min != $other->min;
4403         return 0 if $self->range_count != $other->range_count;
4404         return 0 if $self->count != $other->count;
4405
4406         # Here they could be identical because all the tests above passed.
4407         # The loop below is somewhat simpler since we know they have the same
4408         # number of elements.  Compare range by range, until reach the end or
4409         # find something that differs.
4410         my @a_ranges = $self->ranges;
4411         my @b_ranges = $other->ranges;
4412         for my $i (0 .. @a_ranges - 1) {
4413             my $a = $a_ranges[$i];
4414             my $b = $b_ranges[$i];
4415             trace "self $a; other $b" if main::DEBUG && $to_trace;
4416             return 0 if ! defined $b
4417                         || $a->start != $b->start
4418                         || $a->end != $b->end;
4419         }
4420         return 1;
4421     }
4422
4423     sub is_code_point_usable {
4424         # This used only for making the test script.  See if the input
4425         # proposed trial code point is one that Perl will handle.  If second
4426         # parameter is 0, it won't select some code points for various
4427         # reasons, noted below.
4428
4429         my $code = shift;
4430         my $try_hard = shift;
4431         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4432
4433         return 0 if $code < 0;                # Never use a negative
4434
4435         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4436         # the character very frequently used.
4437         return $try_hard if $code == 0x0000;
4438
4439         # shun non-character code points.
4440         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4441         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4442
4443         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
4444         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4445
4446         return 1;
4447     }
4448
4449     sub get_valid_code_point {
4450         # Return a code point that's part of the range list.  Returns nothing
4451         # if the table is empty or we can't find a suitable code point.  This
4452         # used only for making the test script.
4453
4454         my $self = shift;
4455         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4456
4457         my $addr = do { no overloading; pack 'J', $self; };
4458
4459         # On first pass, don't choose less desirable code points; if no good
4460         # one is found, repeat, allowing a less desirable one to be selected.
4461         for my $try_hard (0, 1) {
4462
4463             # Look through all the ranges for a usable code point.
4464             for my $set (reverse $self->ranges) {
4465
4466                 # Try the edge cases first, starting with the end point of the
4467                 # range.
4468                 my $end = $set->end;
4469                 return $end if is_code_point_usable($end, $try_hard);
4470
4471                 # End point didn't, work.  Start at the beginning and try
4472                 # every one until find one that does work.
4473                 for my $trial ($set->start .. $end - 1) {
4474                     return $trial if is_code_point_usable($trial, $try_hard);
4475                 }
4476             }
4477         }
4478         return ();  # If none found, give up.
4479     }
4480
4481     sub get_invalid_code_point {
4482         # Return a code point that's not part of the table.  Returns nothing
4483         # if the table covers all code points or a suitable code point can't
4484         # be found.  This used only for making the test script.
4485
4486         my $self = shift;
4487         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4488
4489         # Just find a valid code point of the inverse, if any.
4490         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4491     }
4492 } # end closure for Range_List
4493
4494 package Range_Map;
4495 use base '_Range_List_Base';
4496
4497 # A Range_Map is a range list in which the range values (called maps) are
4498 # significant, and hence shouldn't be manipulated by our other code, which
4499 # could be ambiguous or lose things.  For example, in taking the union of two
4500 # lists, which share code points, but which have differing values, which one
4501 # has precedence in the union?
4502 # It turns out that these operations aren't really necessary for map tables,
4503 # and so this class was created to make sure they aren't accidentally
4504 # applied to them.
4505
4506 { # Closure
4507
4508     sub add_map {
4509         # Add a range containing a mapping value to the list
4510
4511         my $self = shift;
4512         # Rest of parameters passed on
4513
4514         return $self->_add_delete('+', @_);
4515     }
4516
4517     sub add_duplicate {
4518         # Adds entry to a range list which can duplicate an existing entry
4519
4520         my $self = shift;
4521         my $code_point = shift;
4522         my $value = shift;
4523         my %args = @_;
4524         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4525         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4526
4527         return $self->add_map($code_point, $code_point,
4528                                 $value, Replace => $replace);
4529     }
4530 } # End of closure for package Range_Map
4531
4532 package _Base_Table;
4533
4534 # A table is the basic data structure that gets written out into a file for
4535 # use by the Perl core.  This is the abstract base class implementing the
4536 # common elements from the derived ones.  A list of the methods to be
4537 # furnished by an implementing class is just after the constructor.
4538
4539 sub standardize { return main::standardize($_[0]); }
4540 sub trace { return main::trace(@_); }
4541
4542 { # Closure
4543
4544     main::setup_package();
4545
4546     my %range_list;
4547     # Object containing the ranges of the table.
4548     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4549
4550     my %full_name;
4551     # The full table name.
4552     main::set_access('full_name', \%full_name, 'r');
4553
4554     my %name;
4555     # The table name, almost always shorter
4556     main::set_access('name', \%name, 'r');
4557
4558     my %short_name;
4559     # The shortest of all the aliases for this table, with underscores removed
4560     main::set_access('short_name', \%short_name);
4561
4562     my %nominal_short_name_length;
4563     # The length of short_name before removing underscores
4564     main::set_access('nominal_short_name_length',
4565                     \%nominal_short_name_length);
4566
4567     my %complete_name;
4568     # The complete name, including property.
4569     main::set_access('complete_name', \%complete_name, 'r');
4570
4571     my %property;
4572     # Parent property this table is attached to.
4573     main::set_access('property', \%property, 'r');
4574
4575     my %aliases;
4576     # Ordered list of alias objects of the table's name.  The first ones in
4577     # the list are output first in comments
4578     main::set_access('aliases', \%aliases, 'readable_array');
4579
4580     my %comment;
4581     # A comment associated with the table for human readers of the files
4582     main::set_access('comment', \%comment, 's');
4583
4584     my %description;
4585     # A comment giving a short description of the table's meaning for human
4586     # readers of the files.
4587     main::set_access('description', \%description, 'readable_array');
4588
4589     my %note;
4590     # A comment giving a short note about the table for human readers of the
4591     # files.
4592     main::set_access('note', \%note, 'readable_array');
4593
4594     my %fate;
4595     # Enum; there are a number of possibilities for what happens to this
4596     # table: it could be normal, or suppressed, or not for external use.  See
4597     # values at definition for $SUPPRESSED.
4598     main::set_access('fate', \%fate, 'r');
4599
4600     my %find_table_from_alias;
4601     # The parent property passes this pointer to a hash which this class adds
4602     # all its aliases to, so that the parent can quickly take an alias and
4603     # find this table.
4604     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4605
4606     my %locked;
4607     # After this table is made equivalent to another one; we shouldn't go
4608     # changing the contents because that could mean it's no longer equivalent
4609     main::set_access('locked', \%locked, 'r');
4610
4611     my %file_path;
4612     # This gives the final path to the file containing the table.  Each
4613     # directory in the path is an element in the array
4614     main::set_access('file_path', \%file_path, 'readable_array');
4615
4616     my %status;
4617     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4618     main::set_access('status', \%status, 'r');
4619
4620     my %status_info;
4621     # A comment about its being obsolete, or whatever non normal status it has
4622     main::set_access('status_info', \%status_info, 'r');
4623
4624     my %caseless_equivalent;
4625     # The table this is equivalent to under /i matching, if any.
4626     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4627
4628     my %range_size_1;
4629     # Is the table to be output with each range only a single code point?
4630     # This is done to avoid breaking existing code that may have come to rely
4631     # on this behavior in previous versions of this program.)
4632     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4633
4634     my %perl_extension;
4635     # A boolean set iff this table is a Perl extension to the Unicode
4636     # standard.
4637     main::set_access('perl_extension', \%perl_extension, 'r');
4638
4639     my %output_range_counts;
4640     # A boolean set iff this table is to have comments written in the
4641     # output file that contain the number of code points in the range.
4642     # The constructor can override the global flag of the same name.
4643     main::set_access('output_range_counts', \%output_range_counts, 'r');
4644
4645     my %format;
4646     # The format of the entries of the table.  This is calculated from the
4647     # data in the table (or passed in the constructor).  This is an enum e.g.,
4648     # $STRING_FORMAT.  It is marked protected as it should not be generally
4649     # used to override calculations.
4650     main::set_access('format', \%format, 'r', 'p_s');
4651
4652     sub new {
4653         # All arguments are key => value pairs, which you can see below, most
4654         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
4655         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4656         # documented in the Alias package
4657
4658         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4659
4660         my $class = shift;
4661
4662         my $self = bless \do { my $anonymous_scalar }, $class;
4663         my $addr = do { no overloading; pack 'J', $self; };
4664
4665         my %args = @_;
4666
4667         $name{$addr} = delete $args{'Name'};
4668         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4669         $full_name{$addr} = delete $args{'Full_Name'};
4670         my $complete_name = $complete_name{$addr}
4671                           = delete $args{'Complete_Name'};
4672         $format{$addr} = delete $args{'Format'};
4673         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4674         $property{$addr} = delete $args{'_Property'};
4675         $range_list{$addr} = delete $args{'_Range_List'};
4676         $status{$addr} = delete $args{'Status'} || $NORMAL;
4677         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4678         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4679         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4680         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4681         my $ucd = delete $args{'UCD'};
4682
4683         my $description = delete $args{'Description'};
4684         my $ok_as_filename = delete $args{'OK_as_Filename'};
4685         my $loose_match = delete $args{'Fuzzy'};
4686         my $note = delete $args{'Note'};
4687         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4688         my $perl_extension = delete $args{'Perl_Extension'};
4689
4690         # Shouldn't have any left over
4691         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4692
4693         # Can't use || above because conceivably the name could be 0, and
4694         # can't use // operator in case this program gets used in Perl 5.8
4695         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4696         $output_range_counts{$addr} = $output_range_counts if
4697                                         ! defined $output_range_counts{$addr};
4698
4699         $aliases{$addr} = [ ];
4700         $comment{$addr} = [ ];
4701         $description{$addr} = [ ];
4702         $note{$addr} = [ ];
4703         $file_path{$addr} = [ ];
4704         $locked{$addr} = "";
4705
4706         push @{$description{$addr}}, $description if $description;
4707         push @{$note{$addr}}, $note if $note;
4708
4709         if ($fate{$addr} == $PLACEHOLDER) {
4710
4711             # A placeholder table doesn't get documented, is a perl extension,
4712             # and quite likely will be empty
4713             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4714             $perl_extension = 1 if ! defined $perl_extension;
4715             $ucd = 0 if ! defined $ucd;
4716             push @tables_that_may_be_empty, $complete_name{$addr};
4717             $self->add_comment(<<END);
4718 This is a placeholder because it is not in Version $string_version of Unicode,
4719 but is needed by the Perl core to work gracefully.  Because it is not in this
4720 version of Unicode, it will not be listed in $pod_file.pod
4721 END
4722         }
4723         elsif (exists $why_suppressed{$complete_name}
4724                 # Don't suppress if overridden
4725                 && ! grep { $_ eq $complete_name{$addr} }
4726                                                     @output_mapped_properties)
4727         {
4728             $fate{$addr} = $SUPPRESSED;
4729         }
4730         elsif ($fate{$addr} == $SUPPRESSED
4731                && ! exists $why_suppressed{$property{$addr}->complete_name})
4732         {
4733             Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4734             # perhaps Fate => [ $SUPPRESSED, "reason" ]
4735         }
4736
4737         # If hasn't set its status already, see if it is on one of the
4738         # lists of properties or tables that have particular statuses; if
4739         # not, is normal.  The lists are prioritized so the most serious
4740         # ones are checked first
4741         if (! $status{$addr}) {
4742             if (exists $why_deprecated{$complete_name}) {
4743                 $status{$addr} = $DEPRECATED;
4744             }
4745             elsif (exists $why_stabilized{$complete_name}) {
4746                 $status{$addr} = $STABILIZED;
4747             }
4748             elsif (exists $why_obsolete{$complete_name}) {
4749                 $status{$addr} = $OBSOLETE;
4750             }
4751
4752             # Existence above doesn't necessarily mean there is a message
4753             # associated with it.  Use the most serious message.
4754             if ($status{$addr}) {
4755                 if ($why_deprecated{$complete_name}) {
4756                     $status_info{$addr}
4757                                 = $why_deprecated{$complete_name};
4758                 }
4759                 elsif ($why_stabilized{$complete_name}) {
4760                     $status_info{$addr}
4761                                 = $why_stabilized{$complete_name};
4762                 }
4763                 elsif ($why_obsolete{$complete_name}) {
4764                     $status_info{$addr}
4765                                 = $why_obsolete{$complete_name};
4766                 }
4767             }
4768         }
4769
4770         $perl_extension{$addr} = $perl_extension || 0;
4771
4772         # Don't list a property by default that is internal only
4773         if ($fate{$addr} > $MAP_PROXIED) {
4774             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4775             $ucd = 0 if ! defined $ucd;
4776         }
4777         else {
4778             $ucd = 1 if ! defined $ucd;
4779         }
4780
4781         # By convention what typically gets printed only or first is what's
4782         # first in the list, so put the full name there for good output
4783         # clarity.  Other routines rely on the full name being first on the
4784         # list
4785         $self->add_alias($full_name{$addr},
4786                             OK_as_Filename => $ok_as_filename,
4787                             Fuzzy => $loose_match,
4788                             Re_Pod_Entry => $make_re_pod_entry,
4789                             Status => $status{$addr},
4790                             UCD => $ucd,
4791                             );
4792
4793         # Then comes the other name, if meaningfully different.
4794         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4795             $self->add_alias($name{$addr},
4796                             OK_as_Filename => $ok_as_filename,
4797                             Fuzzy => $loose_match,
4798                             Re_Pod_Entry => $make_re_pod_entry,
4799                             Status => $status{$addr},
4800                             UCD => $ucd,
4801                             );
4802         }
4803
4804         return $self;
4805     }
4806
4807     # Here are the methods that are required to be defined by any derived
4808     # class
4809     for my $sub (qw(
4810                     handle_special_range
4811                     append_to_body
4812                     pre_body
4813                 ))
4814                 # write() knows how to write out normal ranges, but it calls
4815                 # handle_special_range() when it encounters a non-normal one.
4816                 # append_to_body() is called by it after it has handled all
4817                 # ranges to add anything after the main portion of the table.
4818                 # And finally, pre_body() is called after all this to build up
4819                 # anything that should appear before the main portion of the
4820                 # table.  Doing it this way allows things in the middle to
4821                 # affect what should appear before the main portion of the
4822                 # table.
4823     {
4824         no strict "refs";
4825         *$sub = sub {
4826             Carp::my_carp_bug( __LINE__
4827                               . ": Must create method '$sub()' for "
4828                               . ref shift);
4829             return;
4830         }
4831     }
4832
4833     use overload
4834         fallback => 0,
4835         "." => \&main::_operator_dot,
4836         '!=' => \&main::_operator_not_equal,
4837         '==' => \&main::_operator_equal,
4838     ;
4839
4840     sub ranges {
4841         # Returns the array of ranges associated with this table.
4842
4843         no overloading;
4844         return $range_list{pack 'J', shift}->ranges;
4845     }
4846
4847     sub add_alias {
4848         # Add a synonym for this table.
4849
4850         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4851
4852         my $self = shift;
4853         my $name = shift;       # The name to add.
4854         my $pointer = shift;    # What the alias hash should point to.  For
4855                                 # map tables, this is the parent property;
4856                                 # for match tables, it is the table itself.
4857
4858         my %args = @_;
4859         my $loose_match = delete $args{'Fuzzy'};
4860
4861         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4862         $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
4863
4864         my $ok_as_filename = delete $args{'OK_as_Filename'};
4865         $ok_as_filename = 1 unless defined $ok_as_filename;
4866
4867         my $status = delete $args{'Status'};
4868         $status = $NORMAL unless defined $status;
4869
4870         # An internal name does not get documented, unless overridden by the
4871         # input.
4872         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
4873
4874         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4875
4876         # Capitalize the first letter of the alias unless it is one of the CJK
4877         # ones which specifically begins with a lower 'k'.  Do this because
4878         # Unicode has varied whether they capitalize first letters or not, and
4879         # have later changed their minds and capitalized them, but not the
4880         # other way around.  So do it always and avoid changes from release to
4881         # release
4882         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4883
4884         my $addr = do { no overloading; pack 'J', $self; };
4885
4886         # Figure out if should be loosely matched if not already specified.
4887         if (! defined $loose_match) {
4888
4889             # Is a loose_match if isn't null, and doesn't begin with an
4890             # underscore and isn't just a number
4891             if ($name ne ""
4892                 && substr($name, 0, 1) ne '_'
4893                 && $name !~ qr{^[0-9_.+-/]+$})
4894             {
4895                 $loose_match = 1;
4896             }
4897             else {
4898                 $loose_match = 0;
4899             }
4900         }
4901
4902         # If this alias has already been defined, do nothing.
4903         return if defined $find_table_from_alias{$addr}->{$name};
4904
4905         # That includes if it is standardly equivalent to an existing alias,
4906         # in which case, add this name to the list, so won't have to search
4907         # for it again.
4908         my $standard_name = main::standardize($name);
4909         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4910             $find_table_from_alias{$addr}->{$name}
4911                         = $find_table_from_alias{$addr}->{$standard_name};
4912             return;
4913         }
4914
4915         # Set the index hash for this alias for future quick reference.
4916         $find_table_from_alias{$addr}->{$name} = $pointer;
4917         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4918         local $to_trace = 0 if main::DEBUG;
4919         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4920         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4921
4922
4923         # Put the new alias at the end of the list of aliases unless the final
4924         # element begins with an underscore (meaning it is for internal perl
4925         # use) or is all numeric, in which case, put the new one before that
4926         # one.  This floats any all-numeric or underscore-beginning aliases to
4927         # the end.  This is done so that they are listed last in output lists,
4928         # to encourage the user to use a better name (either more descriptive
4929         # or not an internal-only one) instead.  This ordering is relied on
4930         # implicitly elsewhere in this program, like in short_name()
4931         my $list = $aliases{$addr};
4932         my $insert_position = (@$list == 0
4933                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4934                                     && $list->[-1]->name =~ /\D/))
4935                             ? @$list
4936                             : @$list - 1;
4937         splice @$list,
4938                 $insert_position,
4939                 0,
4940                 Alias->new($name, $loose_match, $make_re_pod_entry,
4941                                                 $ok_as_filename, $status, $ucd);
4942
4943         # This name may be shorter than any existing ones, so clear the cache
4944         # of the shortest, so will have to be recalculated.
4945         no overloading;
4946         undef $short_name{pack 'J', $self};
4947         return;
4948     }
4949
4950     sub short_name {
4951         # Returns a name suitable for use as the base part of a file name.
4952         # That is, shorter wins.  It can return undef if there is no suitable
4953         # name.  The name has all non-essential underscores removed.
4954
4955         # The optional second parameter is a reference to a scalar in which
4956         # this routine will store the length the returned name had before the
4957         # underscores were removed, or undef if the return is undef.
4958
4959         # The shortest name can change if new aliases are added.  So using
4960         # this should be deferred until after all these are added.  The code
4961         # that does that should clear this one's cache.
4962         # Any name with alphabetics is preferred over an all numeric one, even
4963         # if longer.
4964
4965         my $self = shift;
4966         my $nominal_length_ptr = shift;
4967         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4968
4969         my $addr = do { no overloading; pack 'J', $self; };
4970
4971         # For efficiency, don't recalculate, but this means that adding new
4972         # aliases could change what the shortest is, so the code that does
4973         # that needs to undef this.
4974         if (defined $short_name{$addr}) {
4975             if ($nominal_length_ptr) {
4976                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4977             }
4978             return $short_name{$addr};
4979         }
4980
4981         # Look at each alias
4982         foreach my $alias ($self->aliases()) {
4983
4984             # Don't use an alias that isn't ok to use for an external name.
4985             next if ! $alias->ok_as_filename;
4986
4987             my $name = main::Standardize($alias->name);
4988             trace $self, $name if main::DEBUG && $to_trace;
4989
4990             # Take the first one, or a shorter one that isn't numeric.  This
4991             # relies on numeric aliases always being last in the array
4992             # returned by aliases().  Any alpha one will have precedence.
4993             if (! defined $short_name{$addr}
4994                 || ($name =~ /\D/
4995                     && length($name) < length($short_name{$addr})))
4996             {
4997                 # Remove interior underscores.
4998                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4999
5000                 $nominal_short_name_length{$addr} = length $name;
5001             }
5002         }
5003
5004         # If the short name isn't a nice one, perhaps an equivalent table has
5005         # a better one.
5006         if (! defined $short_name{$addr}
5007             || $short_name{$addr} eq ""
5008             || $short_name{$addr} eq "_")
5009         {
5010             my $return;
5011             foreach my $follower ($self->children) {    # All equivalents
5012                 my $follower_name = $follower->short_name;
5013                 next unless defined $follower_name;
5014
5015                 # Anything (except undefined) is better than underscore or
5016                 # empty
5017                 if (! defined $return || $return eq "_") {
5018                     $return = $follower_name;
5019                     next;
5020                 }
5021
5022                 # If the new follower name isn't "_" and is shorter than the
5023                 # current best one, prefer the new one.
5024                 next if $follower_name eq "_";
5025                 next if length $follower_name > length $return;
5026                 $return = $follower_name;
5027             }
5028             $short_name{$addr} = $return if defined $return;
5029         }
5030
5031         # If no suitable external name return undef
5032         if (! defined $short_name{$addr}) {
5033             $$nominal_length_ptr = undef if $nominal_length_ptr;
5034             return;
5035         }
5036
5037         # Don't allow a null short name.
5038         if ($short_name{$addr} eq "") {
5039             $short_name{$addr} = '_';
5040             $nominal_short_name_length{$addr} = 1;
5041         }
5042
5043         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5044
5045         if ($nominal_length_ptr) {
5046             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5047         }
5048         return $short_name{$addr};
5049     }
5050
5051     sub external_name {
5052         # Returns the external name that this table should be known by.  This
5053         # is usually the short_name, but not if the short_name is undefined,
5054         # in which case the external_name is arbitrarily set to the
5055         # underscore.
5056
5057         my $self = shift;
5058         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5059
5060         my $short = $self->short_name;
5061         return $short if defined $short;
5062
5063         return '_';
5064     }
5065
5066     sub add_description { # Adds the parameter as a short description.
5067
5068         my $self = shift;
5069         my $description = shift;
5070         chomp $description;
5071         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5072
5073         no overloading;
5074         push @{$description{pack 'J', $self}}, $description;
5075
5076         return;
5077     }
5078
5079     sub add_note { # Adds the parameter as a short note.
5080
5081         my $self = shift;
5082         my $note = shift;
5083         chomp $note;
5084         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5085
5086         no overloading;
5087         push @{$note{pack 'J', $self}}, $note;
5088
5089         return;
5090     }
5091
5092     sub add_comment { # Adds the parameter as a comment.
5093
5094         return unless $debugging_build;
5095
5096         my $self = shift;
5097         my $comment = shift;
5098         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5099
5100         chomp $comment;
5101
5102         no overloading;
5103         push @{$comment{pack 'J', $self}}, $comment;
5104
5105         return;
5106     }
5107
5108     sub comment {
5109         # Return the current comment for this table.  If called in list
5110         # context, returns the array of comments.  In scalar, returns a string
5111         # of each element joined together with a period ending each.
5112
5113         my $self = shift;
5114         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5115
5116         my $addr = do { no overloading; pack 'J', $self; };
5117         my @list = @{$comment{$addr}};
5118         return @list if wantarray;
5119         my $return = "";
5120         foreach my $sentence (@list) {
5121             $return .= '.  ' if $return;
5122             $return .= $sentence;
5123             $return =~ s/\.$//;
5124         }
5125         $return .= '.' if $return;
5126         return $return;
5127     }
5128
5129     sub initialize {
5130         # Initialize the table with the argument which is any valid
5131         # initialization for range lists.
5132
5133         my $self = shift;
5134         my $addr = do { no overloading; pack 'J', $self; };
5135         my $initialization = shift;
5136         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5137
5138         # Replace the current range list with a new one of the same exact
5139         # type.
5140         my $class = ref $range_list{$addr};
5141         $range_list{$addr} = $class->new(Owner => $self,
5142                                         Initialize => $initialization);
5143         return;
5144
5145     }
5146
5147     sub header {
5148         # The header that is output for the table in the file it is written
5149         # in.
5150
5151         my $self = shift;
5152         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5153
5154         my $return = "";
5155         $return .= $DEVELOPMENT_ONLY if $compare_versions;
5156         $return .= $HEADER;
5157         return $return;
5158     }
5159
5160     sub write {
5161         # Write a representation of the table to its file.  It calls several
5162         # functions furnished by sub-classes of this abstract base class to
5163         # handle non-normal ranges, to add stuff before the table, and at its
5164         # end.  If the table is to be written so that adjustments are
5165         # required, this does that conversion.
5166
5167         my $self = shift;
5168         my $use_adjustments = shift; # ? output in adjusted format or not
5169         my $tab_stops = shift;       # The number of tab stops over to put any
5170                                      # comment.
5171         my $suppress_value = shift;  # Optional, if the value associated with
5172                                      # a range equals this one, don't write
5173                                      # the range
5174         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5175
5176         my $addr = do { no overloading; pack 'J', $self; };
5177
5178         # Start with the header
5179         my @HEADER = $self->header;
5180
5181         # Then the comments
5182         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5183                                                         if $comment{$addr};
5184
5185         # Things discovered processing the main body of the document may
5186         # affect what gets output before it, therefore pre_body() isn't called
5187         # until after all other processing of the table is done.
5188
5189         # The main body looks like a 'here' document.  If annotating, get rid
5190         # of the comments before passing to the caller, as some callers, such
5191         # as charnames.pm, can't cope with them.  (Outputting range counts
5192         # also introduces comments, but these don't show up in the tables that
5193         # can't cope with comments, and there aren't that many of them that
5194         # it's worth the extra real time to get rid of them).
5195         my @OUT;
5196         if ($annotate) {
5197             # Use the line below in Perls that don't have /r
5198             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5199             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5200         } else {
5201             push @OUT, "return <<'END';\n";
5202         }
5203
5204         if ($range_list{$addr}->is_empty) {
5205
5206             # This is a kludge for empty tables to silence a warning in
5207             # utf8.c, which can't really deal with empty tables, but it can
5208             # deal with a table that matches nothing, as the inverse of 'Any'
5209             # does.
5210             push @OUT, "!utf8::Any\n";
5211         }
5212         elsif ($self->name eq 'N'
5213
5214                # To save disk space and table cache space, avoid putting out
5215                # binary N tables, but instead create a file which just inverts
5216                # the Y table.  Since the file will still exist and occupy a
5217                # certain number of blocks, might as well output the whole
5218                # thing if it all will fit in one block.   The number of
5219                # ranges below is an approximate number for that.
5220                && ($self->property->type == $BINARY
5221                    || $self->property->type == $FORCED_BINARY)
5222                # && $self->property->tables == 2  Can't do this because the
5223                #        non-binary properties, like NFDQC aren't specifiable
5224                #        by the notation
5225                && $range_list{$addr}->ranges > 15
5226                && ! $annotate)  # Under --annotate, want to see everything
5227         {
5228             push @OUT, "!utf8::" . $self->property->name . "\n";
5229         }
5230         else {
5231             my $range_size_1 = $range_size_1{$addr};
5232             my $format;            # Used only in $annotate option
5233             my $include_name;      # Used only in $annotate option
5234
5235             if ($annotate) {
5236
5237                 # If annotating each code point, must print 1 per line.
5238                 # The variable could point to a subroutine, and we don't want
5239                 # to lose that fact, so only set if not set already
5240                 $range_size_1 = 1 if ! $range_size_1;
5241
5242                 $format = $self->format;
5243
5244                 # The name of the character is output only for tables that
5245                 # don't already include the name in the output.
5246                 my $property = $self->property;
5247                 $include_name =
5248                     !  ($property == $perl_charname
5249                         || $property == main::property_ref('Unicode_1_Name')
5250                         || $property == main::property_ref('Name')
5251                         || $property == main::property_ref('Name_Alias')
5252                        );
5253             }
5254
5255             # Values for previous time through the loop.  Initialize to
5256             # something that won't be adjacent to the first iteration;
5257             # only $previous_end matters for that.
5258             my $previous_start;
5259             my $previous_end = -2;
5260             my $previous_value;
5261
5262             # Values for next time through the portion of the loop that splits
5263             # the range.  0 in $next_start means there is no remaining portion
5264             # to deal with.
5265             my $next_start = 0;
5266             my $next_end;
5267             my $next_value;
5268             my $offset = 0;
5269
5270             # Output each range as part of the here document.
5271             RANGE:
5272             for my $set ($range_list{$addr}->ranges) {
5273                 if ($set->type != 0) {
5274                     $self->handle_special_range($set);
5275                     next RANGE;
5276                 }
5277                 my $start = $set->start;
5278                 my $end   = $set->end;
5279                 my $value  = $set->value;
5280
5281                 # Don't output ranges whose value is the one to suppress
5282                 next RANGE if defined $suppress_value
5283                               && $value eq $suppress_value;
5284
5285                 {   # This bare block encloses the scope where we may need to
5286                     # split a range (when outputting adjusteds), and each time
5287                     # through we handle the next portion of the original by
5288                     # ending the block with a 'redo'.   The values to use for
5289                     # that next time through are set up just below in the
5290                     # scalars whose names begin with '$next_'.
5291
5292                     if ($use_adjustments) {
5293
5294                         # When converting to use adjustments, we can handle
5295                         # only single element ranges.  Set up so that this
5296                         # time through the loop, we look at the first element,
5297                         # and the next time through, we start off with the
5298                         # remainder.  Thus each time through we look at the
5299                         # first element of the range
5300                         if ($end != $start) {
5301                             $next_start = $start + 1;
5302                             $next_end = $end;
5303                             $next_value = $value;
5304                             $end = $start;
5305                         }
5306
5307                         # The values for some of these tables are stored as
5308                         # hex strings.  Convert those to decimal
5309                         $value = hex($value)
5310                                     if $self->default_map eq $CODE_POINT
5311                                         && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5312
5313                         # If this range is adjacent to the previous one, and
5314                         # the values in each are integers that are also
5315                         # adjacent (differ by 1), then this range really
5316                         # extends the previous one that is already in element
5317                         # $OUT[-1].  So we pop that element, and pretend that
5318                         # the range starts with whatever it started with.
5319                         # $offset is incremented by 1 each time so that it
5320                         # gives the current offset from the first element in
5321                         # the accumulating range, and we keep in $value the
5322                         # value of that first element.
5323                         if ($start == $previous_end + 1
5324                             && $value =~ /^ -? \d+ $/xa
5325                             && $previous_value =~ /^ -? \d+ $/xa
5326                             && ($value == ($previous_value + ++$offset)))
5327                         {
5328                             pop @OUT;
5329                             $start = $previous_start;
5330                             $value = $previous_value;
5331                         }
5332                         else {
5333                             $offset = 0;
5334                         }
5335
5336                         # Save the current values for the next time through
5337                         # the loop.
5338                         $previous_start = $start;
5339                         $previous_end = $end;
5340                         $previous_value = $value;
5341                     }
5342
5343                     # If there is a range and doesn't need a single point range
5344                     # output
5345                     if ($start != $end && ! $range_size_1) {
5346                         push @OUT, sprintf "%04X\t%04X", $start, $end;
5347                         $OUT[-1] .= "\t$value" if $value ne "";
5348
5349                         # Add a comment with the size of the range, if
5350                         # requested.  Expand Tabs to make sure they all start
5351                         # in the same column, and then unexpand to use mostly
5352                         # tabs.
5353                         if (! $output_range_counts{$addr}) {
5354                             $OUT[-1] .= "\n";
5355                         }
5356                         else {
5357                             $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5358                             my $count = main::clarify_number($end - $start + 1);
5359                             use integer;
5360
5361                             my $width = $tab_stops * 8 - 1;
5362                             $OUT[-1] = sprintf("%-*s # [%s]\n",
5363                                                 $width,
5364                                                 $OUT[-1],
5365                                                 $count);
5366                             $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5367                         }
5368                     }
5369
5370                         # Here to output a single code point per line.
5371                         # If not to annotate, use the simple formats
5372                     elsif (! $annotate) {
5373
5374                         # Use any passed in subroutine to output.
5375                         if (ref $range_size_1 eq 'CODE') {
5376                             for my $i ($start .. $end) {
5377                                 push @OUT, &{$range_size_1}($i, $value);
5378                             }
5379                         }
5380                         else {
5381
5382                             # Here, caller is ok with default output.
5383                             for (my $i = $start; $i <= $end; $i++) {
5384                                 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5385                             }
5386                         }
5387                     }
5388                     else {
5389
5390                         # Here, wants annotation.
5391                         for (my $i = $start; $i <= $end; $i++) {
5392
5393                             # Get character information if don't have it already
5394                             main::populate_char_info($i)
5395                                                 if ! defined $viacode[$i];
5396                             my $type = $annotate_char_type[$i];
5397
5398                             # Figure out if should output the next code points
5399                             # as part of a range or not.  If this is not in an
5400                             # annotation range, then won't output as a range,
5401                             # so returns $i.  Otherwise use the end of the
5402                             # annotation range, but no further than the
5403                             # maximum possible end point of the loop.
5404                             my $range_end = main::min(
5405                                         $annotate_ranges->value_of($i) || $i,
5406                                         $end);
5407
5408                             # Use a range if it is a range, and either is one
5409                             # of the special annotation ranges, or the range
5410                             # is at most 3 long.  This last case causes the
5411                             # algorithmically named code points to be output
5412                             # individually in spans of at most 3, as they are
5413                             # the ones whose $type is > 0.
5414                             if ($range_end != $i
5415                                 && ( $type < 0 || $range_end - $i > 2))
5416                             {
5417                                 # Here is to output a range.  We don't allow a
5418                                 # caller-specified output format--just use the
5419                                 # standard one.
5420                                 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5421                                                                 $range_end,
5422                                                                 $value;
5423                                 my $range_name = $viacode[$i];
5424
5425                                 # For the code points which end in their hex
5426                                 # value, we eliminate that from the output
5427                                 # annotation, and capitalize only the first
5428                                 # letter of each word.
5429                                 if ($type == $CP_IN_NAME) {
5430                                     my $hex = sprintf "%04X", $i;
5431                                     $range_name =~ s/-$hex$//;
5432                                     my @words = split " ", $range_name;
5433                                     for my $word (@words) {
5434                                         $word =
5435                                           ucfirst(lc($word)) if $word ne 'CJK';
5436                                     }
5437                                     $range_name = join " ", @words;
5438                                 }
5439                                 elsif ($type == $HANGUL_SYLLABLE) {
5440                                     $range_name = "Hangul Syllable";
5441                                 }
5442
5443                                 $OUT[-1] .= " $range_name" if $range_name;
5444
5445                                 # Include the number of code points in the
5446                                 # range
5447                                 my $count =
5448                                     main::clarify_number($range_end - $i + 1);
5449                                 $OUT[-1] .= " [$count]\n";
5450
5451                                 # Skip to the end of the range
5452                                 $i = $range_end;
5453                             }
5454                             else { # Not in a range.
5455                                 my $comment = "";
5456
5457                                 # When outputting the names of each character,
5458                                 # use the character itself if printable
5459                                 $comment .= "'" . chr($i) . "' "
5460                                                             if $printable[$i];
5461
5462                                 # To make it more readable, use a minimum
5463                                 # indentation
5464                                 my $comment_indent;
5465
5466                                 # Determine the annotation
5467                                 if ($format eq $DECOMP_STRING_FORMAT) {
5468
5469                                     # This is very specialized, with the type
5470                                     # of decomposition beginning the line
5471                                     # enclosed in <...>, and the code points
5472                                     # that the code point decomposes to
5473                                     # separated by blanks.  Create two
5474                                     # strings, one of the printable
5475                                     # characters, and one of their official
5476                                     # names.
5477                                     (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5478                                     my $tostr = "";
5479                                     my $to_name = "";
5480                                     my $to_chr = "";
5481                                     foreach my $to (split " ", $map) {
5482                                         $to = CORE::hex $to;
5483                                         $to_name .= " + " if $to_name;
5484                                         $to_chr .= chr($to);
5485                                         main::populate_char_info($to)
5486                                                     if ! defined $viacode[$to];
5487                                         $to_name .=  $viacode[$to];
5488                                     }
5489
5490                                     $comment .=
5491                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5492                                     $comment_indent = 25;   # Determined by
5493                                                             # experiment
5494                                 }
5495                                 else {
5496
5497                                     # Assume that any table that has hex
5498                                     # format is a mapping of one code point to
5499                                     # another.
5500                                     if ($format eq $HEX_FORMAT) {
5501                                         my $decimal_value = CORE::hex $value;
5502                                         main::populate_char_info($decimal_value)
5503                                         if ! defined $viacode[$decimal_value];
5504                                         $comment .= "=> '"
5505                                         . chr($decimal_value)
5506                                         . "'; " if $printable[$decimal_value];
5507                                     }
5508                                     $comment .= $viacode[$i] if $include_name
5509                                                             && $viacode[$i];
5510                                     if ($format eq $HEX_FORMAT) {
5511                                         my $decimal_value = CORE::hex $value;
5512                                         $comment .=
5513                                             " => $viacode[$decimal_value]"
5514                                                 if $viacode[$decimal_value];
5515                                     }
5516
5517                                     # If including the name, no need to
5518                                     # indent, as the name will already be way
5519                                     # across the line.
5520                                     $comment_indent = ($include_name) ? 0 : 60;
5521                                 }
5522
5523                                 # Use any passed in routine to output the base
5524                                 # part of the line.
5525                                 if (ref $range_size_1 eq 'CODE') {
5526                                     my $base_part=&{$range_size_1}($i, $value);
5527                                     chomp $base_part;
5528                                     push @OUT, $base_part;
5529                                 }
5530                                 else {
5531                                     push @OUT, sprintf "%04X\t\t%s", $i, $value;
5532                                 }
5533
5534                                 # And add the annotation.
5535                                 $OUT[-1] = sprintf "%-*s\t# %s",
5536                                                    $comment_indent,
5537                                                    $OUT[-1],
5538                                                    $comment
5539                                             if $comment;
5540                                 $OUT[-1] .= "\n";
5541                             }
5542                         }
5543                     }
5544
5545                     # If we split the range, set up so the next time through
5546                     # we get the remainder, and redo.
5547                     if ($next_start) {
5548                         $start = $next_start;
5549                         $end = $next_end;
5550                         $value = $next_value;
5551                         $next_start = 0;
5552                         redo;
5553                     }
5554                 }
5555             } # End of loop through all the table's ranges
5556         }
5557
5558         # Add anything that goes after the main body, but within the here
5559         # document,
5560         my $append_to_body = $self->append_to_body;
5561         push @OUT, $append_to_body if $append_to_body;
5562
5563         # And finish the here document.
5564         push @OUT, "END\n";
5565
5566         # Done with the main portion of the body.  Can now figure out what
5567         # should appear before it in the file.
5568         my $pre_body = $self->pre_body;
5569         push @HEADER, $pre_body, "\n" if $pre_body;
5570
5571         # All these files should have a .pl suffix added to them.
5572         my @file_with_pl = @{$file_path{$addr}};
5573         $file_with_pl[-1] .= '.pl';
5574
5575         main::write(\@file_with_pl,
5576                     $annotate,      # utf8 iff annotating
5577                     \@HEADER,
5578                     \@OUT);
5579         return;
5580     }
5581
5582     sub set_status {    # Set the table's status
5583         my $self = shift;
5584         my $status = shift; # The status enum value
5585         my $info = shift;   # Any message associated with it.
5586         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5587
5588         my $addr = do { no overloading; pack 'J', $self; };
5589
5590         $status{$addr} = $status;
5591         $status_info{$addr} = $info;
5592         return;
5593     }
5594
5595     sub set_fate {  # Set the fate of a table
5596         my $self = shift;
5597         my $fate = shift;
5598         my $reason = shift;
5599         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5600
5601         my $addr = do { no overloading; pack 'J', $self; };
5602
5603         return if $fate{$addr} == $fate;    # If no-op
5604
5605         # Can only change the ordinary fate, except if going to $MAP_PROXIED
5606         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5607
5608         $fate{$addr} = $fate;
5609
5610         # Don't document anything to do with a non-normal fated table
5611         if ($fate != $ORDINARY) {
5612             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5613             foreach my $alias ($self->aliases) {
5614                 $alias->set_ucd($put_in_pod);
5615
5616                 # MAP_PROXIED doesn't affect the match tables
5617                 next if $fate == $MAP_PROXIED;
5618                 $alias->set_make_re_pod_entry($put_in_pod);
5619             }
5620         }
5621
5622         # Save the reason for suppression for output
5623         if ($fate == $SUPPRESSED && defined $reason) {
5624             $why_suppressed{$complete_name{$addr}} = $reason;
5625         }
5626
5627         return;
5628     }
5629
5630     sub lock {
5631         # Don't allow changes to the table from now on.  This stores a stack
5632         # trace of where it was called, so that later attempts to modify it
5633         # can immediately show where it got locked.
5634
5635         my $self = shift;
5636         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5637
5638         my $addr = do { no overloading; pack 'J', $self; };
5639
5640         $locked{$addr} = "";
5641
5642         my $line = (caller(0))[2];
5643         my $i = 1;
5644
5645         # Accumulate the stack trace
5646         while (1) {
5647             my ($pkg, $file, $caller_line, $caller) = caller $i++;
5648
5649             last unless defined $caller;
5650
5651             $locked{$addr} .= "    called from $caller() at line $line\n";
5652             $line = $caller_line;
5653         }
5654         $locked{$addr} .= "    called from main at line $line\n";
5655
5656         return;
5657     }
5658
5659     sub carp_if_locked {
5660         # Return whether a table is locked or not, and, by the way, complain
5661         # if is locked
5662
5663         my $self = shift;
5664         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5665
5666         my $addr = do { no overloading; pack 'J', $self; };
5667
5668         return 0 if ! $locked{$addr};
5669         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5670         return 1;
5671     }
5672
5673     sub set_file_path { # Set the final directory path for this table
5674         my $self = shift;
5675         # Rest of parameters passed on
5676
5677         no overloading;
5678         @{$file_path{pack 'J', $self}} = @_;
5679         return
5680     }
5681
5682     # Accessors for the range list stored in this table.  First for
5683     # unconditional
5684     for my $sub (qw(
5685                     containing_range
5686                     contains
5687                     count
5688                     each_range
5689                     hash
5690                     is_empty
5691                     matches_identically_to
5692                     max
5693                     min
5694                     range_count
5695                     reset_each_range
5696                     type_of
5697                     value_of
5698                 ))
5699     {
5700         no strict "refs";
5701         *$sub = sub {
5702             use strict "refs";
5703             my $self = shift;
5704             return $self->_range_list->$sub(@_);
5705         }
5706     }
5707
5708     # Then for ones that should fail if locked
5709     for my $sub (qw(
5710                     delete_range
5711                 ))
5712     {
5713         no strict "refs";
5714         *$sub = sub {
5715             use strict "refs";
5716             my $self = shift;
5717
5718             return if $self->carp_if_locked;
5719             no overloading;
5720             return $self->_range_list->$sub(@_);
5721         }
5722     }
5723
5724 } # End closure
5725
5726 package Map_Table;
5727 use base '_Base_Table';
5728
5729 # A Map Table is a table that contains the mappings from code points to
5730 # values.  There are two weird cases:
5731 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5732 #    are written in the table's file at the end of the table nonetheless.  It
5733 #    requires specially constructed code to handle these; utf8.c can not read
5734 #    these in, so they should not go in $map_directory.  As of this writing,
5735 #    the only case that these happen is for named sequences used in
5736 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
5737 #    something else could come along that uses it.
5738 # 2) Specials are anything that doesn't fit syntactically into the body of the
5739 #    table.  The ranges for these have a map type of non-zero.  The code below
5740 #    knows about and handles each possible type.   In most cases, these are
5741 #    written as part of the header.
5742 #
5743 # A map table deliberately can't be manipulated at will unlike match tables.
5744 # This is because of the ambiguities having to do with what to do with
5745 # overlapping code points.  And there just isn't a need for those things;
5746 # what one wants to do is just query, add, replace, or delete mappings, plus
5747 # write the final result.
5748 # However, there is a method to get the list of possible ranges that aren't in
5749 # this table to use for defaulting missing code point mappings.  And,
5750 # map_add_or_replace_non_nulls() does allow one to add another table to this
5751 # one, but it is clearly very specialized, and defined that the other's
5752 # non-null values replace this one's if there is any overlap.
5753
5754 sub trace { return main::trace(@_); }
5755
5756 { # Closure
5757
5758     main::setup_package();
5759
5760     my %default_map;
5761     # Many input files omit some entries; this gives what the mapping for the
5762     # missing entries should be
5763     main::set_access('default_map', \%default_map, 'r');
5764
5765     my %anomalous_entries;
5766     # Things that go in the body of the table which don't fit the normal
5767     # scheme of things, like having a range.  Not much can be done with these
5768     # once there except to output them.  This was created to handle named
5769     # sequences.
5770     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5771     main::set_access('anomalous_entries',       # Append singular, read plural
5772                     \%anomalous_entries,
5773                     'readable_array');
5774
5775     my %to_output_map;
5776     # Enum as to whether or not to write out this map table, and how:
5777     #   0               don't output
5778     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
5779     #                   it should not be removed nor its format changed.  This
5780     #                   is done for those files that have traditionally been
5781     #                   output.
5782     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
5783     #                   with this file
5784     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
5785     #                   outputting the actual mappings as-is, we adjust things
5786     #                   to create a much more compact table. Only those few
5787     #                   tables where the mapping is convertible at least to an
5788     #                   integer and compacting makes a big difference should
5789     #                   have this.  Hence, the default is to not do this
5790     #                   unless the table's default mapping is to $CODE_POINT,
5791     #                   and the range size is not 1.
5792     main::set_access('to_output_map', \%to_output_map, 's');
5793
5794     sub new {
5795         my $class = shift;
5796         my $name = shift;
5797
5798         my %args = @_;
5799
5800         # Optional initialization data for the table.
5801         my $initialize = delete $args{'Initialize'};
5802
5803         my $default_map = delete $args{'Default_Map'};
5804         my $property = delete $args{'_Property'};
5805         my $full_name = delete $args{'Full_Name'};
5806         my $to_output_map = delete $args{'To_Output_Map'};
5807
5808         # Rest of parameters passed on
5809
5810         my $range_list = Range_Map->new(Owner => $property);
5811
5812         my $self = $class->SUPER::new(
5813                                     Name => $name,
5814                                     Complete_Name =>  $full_name,
5815                                     Full_Name => $full_name,
5816                                     _Property => $property,
5817                                     _Range_List => $range_list,
5818                                     %args);
5819
5820         my $addr = do { no overloading; pack 'J', $self; };
5821
5822         $anomalous_entries{$addr} = [];
5823         $default_map{$addr} = $default_map;
5824         $to_output_map{$addr} = $to_output_map;
5825
5826         $self->initialize($initialize) if defined $initialize;
5827
5828         return $self;
5829     }
5830
5831     use overload
5832         fallback => 0,
5833         qw("") => "_operator_stringify",
5834     ;
5835
5836     sub _operator_stringify {
5837         my $self = shift;
5838
5839         my $name = $self->property->full_name;
5840         $name = '""' if $name eq "";
5841         return "Map table for Property '$name'";
5842     }
5843
5844     sub add_alias {
5845         # Add a synonym for this table (which means the property itself)
5846         my $self = shift;
5847         my $name = shift;
5848         # Rest of parameters passed on.
5849
5850         $self->SUPER::add_alias($name, $self->property, @_);
5851         return;
5852     }
5853
5854     sub add_map {
5855         # Add a range of code points to the list of specially-handled code
5856         # points.  $MULTI_CP is assumed if the type of special is not passed
5857         # in.
5858
5859         my $self = shift;
5860         my $lower = shift;
5861         my $upper = shift;
5862         my $string = shift;
5863         my %args = @_;
5864
5865         my $type = delete $args{'Type'} || 0;
5866         # Rest of parameters passed on
5867
5868         # Can't change the table if locked.
5869         return if $self->carp_if_locked;
5870
5871         my $addr = do { no overloading; pack 'J', $self; };
5872
5873         $self->_range_list->add_map($lower, $upper,
5874                                     $string,
5875                                     @_,
5876                                     Type => $type);
5877         return;
5878     }
5879
5880     sub append_to_body {
5881         # Adds to the written HERE document of the table's body any anomalous
5882         # entries in the table..
5883
5884         my $self = shift;
5885         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5886
5887         my $addr = do { no overloading; pack 'J', $self; };
5888
5889         return "" unless @{$anomalous_entries{$addr}};
5890         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5891     }
5892
5893     sub map_add_or_replace_non_nulls {
5894         # This adds the mappings in the table $other to $self.  Non-null
5895         # mappings from $other override those in $self.  It essentially merges
5896         # the two tables, with the second having priority except for null
5897         # mappings.
5898
5899         my $self = shift;
5900         my $other = shift;
5901         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5902
5903         return if $self->carp_if_locked;
5904
5905         if (! $other->isa(__PACKAGE__)) {
5906             Carp::my_carp_bug("$other should be a "
5907                         . __PACKAGE__
5908                         . ".  Not a '"
5909                         . ref($other)
5910                         . "'.  Not added;");
5911             return;
5912         }
5913
5914         my $addr = do { no overloading; pack 'J', $self; };
5915         my $other_addr = do { no overloading; pack 'J', $other; };
5916
5917         local $to_trace = 0 if main::DEBUG;
5918
5919         my $self_range_list = $self->_range_list;
5920         my $other_range_list = $other->_range_list;
5921         foreach my $range ($other_range_list->ranges) {
5922             my $value = $range->value;
5923             next if $value eq "";
5924             $self_range_list->_add_delete('+',
5925                                           $range->start,
5926                                           $range->end,
5927                                           $value,
5928                                           Type => $range->type,
5929                                           Replace => $UNCONDITIONALLY);
5930         }
5931
5932         return;
5933     }
5934
5935     sub set_default_map {
5936         # Define what code points that are missing from the input files should
5937         # map to
5938
5939         my $self = shift;
5940         my $map = shift;
5941         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5942
5943         my $addr = do { no overloading; pack 'J', $self; };
5944
5945         # Convert the input to the standard equivalent, if any (won't have any
5946         # for $STRING properties)
5947         my $standard = $self->_find_table_from_alias->{$map};
5948         $map = $standard->name if defined $standard;
5949
5950         # Warn if there already is a non-equivalent default map for this
5951         # property.  Note that a default map can be a ref, which means that
5952         # what it actually means is delayed until later in the program, and it
5953         # IS permissible to override it here without a message.
5954         my $default_map = $default_map{$addr};
5955         if (defined $default_map
5956             && ! ref($default_map)
5957             && $default_map ne $map
5958             && main::Standardize($map) ne $default_map)
5959         {
5960             my $property = $self->property;
5961             my $map_table = $property->table($map);
5962             my $default_table = $property->table($default_map);
5963             if (defined $map_table
5964                 && defined $default_table
5965                 && $map_table != $default_table)
5966             {
5967                 Carp::my_carp("Changing the default mapping for "
5968                             . $property
5969                             . " from $default_map to $map'");
5970             }
5971         }
5972
5973         $default_map{$addr} = $map;
5974
5975         # Don't also create any missing table for this map at this point,
5976         # because if we did, it could get done before the main table add is
5977         # done for PropValueAliases.txt; instead the caller will have to make
5978         # sure it exists, if desired.
5979         return;
5980     }
5981
5982     sub to_output_map {
5983         # Returns boolean: should we write this map table?
5984
5985         my $self = shift;
5986         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5987
5988         my $addr = do { no overloading; pack 'J', $self; };
5989
5990         # If overridden, use that
5991         return $to_output_map{$addr} if defined $to_output_map{$addr};
5992
5993         my $full_name = $self->full_name;
5994         return $global_to_output_map{$full_name}
5995                                 if defined $global_to_output_map{$full_name};
5996
5997         # If table says to output, do so; if says to suppress it, do so.
5998         my $fate = $self->fate;
5999         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6000         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6001         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6002
6003         my $type = $self->property->type;
6004
6005         # Don't want to output binary map tables even for debugging.
6006         return 0 if $type == $BINARY;
6007
6008         # But do want to output string ones.  All the ones that remain to
6009         # be dealt with (i.e. which haven't explicitly been set to external)
6010         # are for internal Perl use only.  The default for those that map to
6011         # $CODE_POINT and haven't been restricted to a single element range
6012         # is to use the adjusted form.
6013         if ($type == $STRING) {
6014             return $INTERNAL_MAP if $self->range_size_1
6015                                     || $default_map{$addr} ne $CODE_POINT;
6016             return $OUTPUT_ADJUSTED;
6017         }
6018
6019         # Otherwise is an $ENUM, do output it, for Perl's purposes
6020         return $INTERNAL_MAP;
6021     }
6022
6023     sub inverse_list {
6024         # Returns a Range_List that is gaps of the current table.  That is,
6025         # the inversion
6026
6027         my $self = shift;
6028         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6029
6030         my $current = Range_List->new(Initialize => $self->_range_list,
6031                                 Owner => $self->property);
6032         return ~ $current;
6033     }
6034
6035     sub header {
6036         my $self = shift;
6037         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6038
6039         my $return = $self->SUPER::header();
6040
6041         if ($self->to_output_map >= $INTERNAL_MAP) {
6042             $return .= $INTERNAL_ONLY_HEADER;
6043         }
6044         else {
6045             my $property_name = $self->property->full_name =~ s/Legacy_//r;
6046             $return .= <<END;
6047
6048 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6049
6050 # This file is for internal use by core Perl only.  It is retained for
6051 # backwards compatibility with applications that may have come to rely on it,
6052 # but its format and even its name or existence are subject to change without
6053 # notice in a future Perl version.  Don't use it directly.  Instead, its
6054 # contents are now retrievable through a stable API in the Unicode::UCD
6055 # module: Unicode::UCD::prop_invmap('$property_name').
6056 END
6057         }
6058         return $return;
6059     }
6060
6061     sub set_final_comment {
6062         # Just before output, create the comment that heads the file
6063         # containing this table.
6064
6065         return unless $debugging_build;
6066
6067         my $self = shift;
6068         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6069
6070         # No sense generating a comment if aren't going to write it out.
6071         return if ! $self->to_output_map;
6072
6073         my $addr = do { no overloading; pack 'J', $self; };
6074
6075         my $property = $self->property;
6076
6077         # Get all the possible names for this property.  Don't use any that
6078         # aren't ok for use in a file name, etc.  This is perhaps causing that
6079         # flag to do double duty, and may have to be changed in the future to
6080         # have our own flag for just this purpose; but it works now to exclude
6081         # Perl generated synonyms from the lists for properties, where the
6082         # name is always the proper Unicode one.
6083         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6084
6085         my $count = $self->count;
6086         my $default_map = $default_map{$addr};
6087
6088         # The ranges that map to the default aren't output, so subtract that
6089         # to get those actually output.  A property with matching tables
6090         # already has the information calculated.
6091         if ($property->type != $STRING) {
6092             $count -= $property->table($default_map)->count;
6093         }
6094         elsif (defined $default_map) {
6095
6096             # But for $STRING properties, must calculate now.  Subtract the
6097             # count from each range that maps to the default.
6098             foreach my $range ($self->_range_list->ranges) {
6099                 if ($range->value eq $default_map) {
6100                     $count -= $range->end +1 - $range->start;
6101                 }
6102             }
6103
6104         }
6105
6106         # Get a  string version of $count with underscores in large numbers,
6107         # for clarity.
6108         my $string_count = main::clarify_number($count);
6109
6110         my $code_points = ($count == 1)
6111                         ? 'single code point'
6112                         : "$string_count code points";
6113
6114         my $mapping;
6115         my $these_mappings;
6116         my $are;
6117         if (@property_aliases <= 1) {
6118             $mapping = 'mapping';
6119             $these_mappings = 'this mapping';
6120             $are = 'is'
6121         }
6122         else {
6123             $mapping = 'synonymous mappings';
6124             $these_mappings = 'these mappings';
6125             $are = 'are'
6126         }
6127         my $cp;
6128         if ($count >= $MAX_UNICODE_CODEPOINTS) {
6129             $cp = "any code point in Unicode Version $string_version";
6130         }
6131         else {
6132             my $map_to;
6133             if ($default_map eq "") {
6134                 $map_to = 'the null string';
6135             }
6136             elsif ($default_map eq $CODE_POINT) {
6137                 $map_to = "itself";
6138             }
6139             else {
6140                 $map_to = "'$default_map'";
6141             }
6142             if ($count == 1) {
6143                 $cp = "the single code point";
6144             }
6145             else {
6146                 $cp = "one of the $code_points";
6147             }
6148             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6149         }
6150
6151         my $comment = "";
6152
6153         my $status = $self->status;
6154         if ($status) {
6155             my $warn = uc $status_past_participles{$status};
6156             $comment .= <<END;
6157
6158 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
6159  All property or property=value combinations contained in this file are $warn.
6160  See $unicode_reference_url for what this means.
6161
6162 END
6163         }
6164         $comment .= "This file returns the $mapping:\n";
6165
6166         my $ucd_accessible_name = "";
6167         my $full_name = $self->property->full_name;
6168         for my $i (0 .. @property_aliases - 1) {
6169             my $name = $property_aliases[$i]->name;
6170             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6171             if ($property_aliases[$i]->ucd) {
6172                 if ($name eq $full_name) {
6173                     $ucd_accessible_name = $full_name;
6174                 }
6175                 elsif (! $ucd_accessible_name) {
6176                     $ucd_accessible_name = $name;
6177                 }
6178             }
6179         }
6180         $comment .= "\nwhere 'cp' is $cp.";
6181         if ($ucd_accessible_name) {
6182             $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6183         }
6184
6185         # And append any commentary already set from the actual property.
6186         $comment .= "\n\n" . $self->comment if $self->comment;
6187         if ($self->description) {
6188             $comment .= "\n\n" . join " ", $self->description;
6189         }
6190         if ($self->note) {
6191             $comment .= "\n\n" . join " ", $self->note;
6192         }
6193         $comment .= "\n";
6194
6195         if (! $self->perl_extension) {
6196             $comment .= <<END;
6197
6198 For information about what this property really means, see:
6199 $unicode_reference_url
6200 END
6201         }
6202
6203         if ($count) {        # Format differs for empty table
6204                 $comment.= "\nThe format of the ";
6205             if ($self->range_size_1) {
6206                 $comment.= <<END;
6207 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6208 is in hex; MAPPING is what CODE_POINT maps to.
6209 END
6210             }
6211             else {
6212
6213                 # There are tables which end up only having one element per
6214                 # range, but it is not worth keeping track of for making just
6215                 # this comment a little better.
6216                 $comment.= <<END;
6217 non-comment portions of the main body of lines of this file is:
6218 START\\tSTOP\\tMAPPING where START is the starting code point of the
6219 range, in hex; STOP is the ending point, or if omitted, the range has just one
6220 code point; MAPPING is what each code point between START and STOP maps to.
6221 END
6222                 if ($self->output_range_counts) {
6223                     $comment .= <<END;
6224 Numbers in comments in [brackets] indicate how many code points are in the
6225 range (omitted when the range is a single code point or if the mapping is to
6226 the null string).
6227 END
6228                 }
6229             }
6230         }
6231         $self->set_comment(main::join_lines($comment));
6232         return;
6233     }
6234
6235     my %swash_keys; # Makes sure don't duplicate swash names.
6236
6237     # The remaining variables are temporaries used while writing each table,
6238     # to output special ranges.
6239     my @multi_code_point_maps;  # Map is to more than one code point.
6240
6241     sub handle_special_range {
6242         # Called in the middle of write when it finds a range it doesn't know
6243         # how to handle.
6244
6245         my $self = shift;
6246         my $range = shift;
6247         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6248
6249         my $addr = do { no overloading; pack 'J', $self; };
6250
6251         my $type = $range->type;
6252
6253         my $low = $range->start;
6254         my $high = $range->end;
6255         my $map = $range->value;
6256
6257         # No need to output the range if it maps to the default.
6258         return if $map eq $default_map{$addr};
6259
6260         my $property = $self->property;
6261
6262         # Switch based on the map type...
6263         if ($type == $HANGUL_SYLLABLE) {
6264
6265             # These are entirely algorithmically determinable based on
6266             # some constants furnished by Unicode; for now, just set a
6267             # flag to indicate that have them.  After everything is figured
6268             # out, we will output the code that does the algorithm.  (Don't
6269             # output them if not needed because we are suppressing this
6270             # property.)
6271             $has_hangul_syllables = 1 if $property->to_output_map;
6272         }
6273         elsif ($type == $CP_IN_NAME) {
6274
6275             # Code points whose name ends in their code point are also
6276             # algorithmically determinable, but need information about the map
6277             # to do so.  Both the map and its inverse are stored in data
6278             # structures output in the file.  They are stored in the mean time
6279             # in global lists The lists will be written out later into Name.pm,
6280             # which is created only if needed.  In order to prevent duplicates
6281             # in the list, only add to them for one property, should multiple
6282             # ones need them.
6283             if ($needing_code_points_ending_in_code_point == 0) {
6284                 $needing_code_points_ending_in_code_point = $property;
6285             }
6286             if ($property == $needing_code_points_ending_in_code_point) {
6287                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6288                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6289
6290                 my $squeezed = $map =~ s/[-\s]+//gr;
6291                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6292                                                                           $low;
6293                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6294                                                                          $high;
6295
6296                 push @code_points_ending_in_code_point, { low => $low,
6297                                                         high => $high,
6298                                                         name => $map
6299                                                         };
6300             }
6301         }
6302         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6303
6304             # Multi-code point maps and null string maps have an entry
6305             # for each code point in the range.  They use the same
6306             # output format.
6307             for my $code_point ($low .. $high) {
6308
6309                 # The pack() below can't cope with surrogates.  XXX This may
6310                 # no longer be true
6311                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6312                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
6313                     next;
6314                 }
6315
6316                 # Generate the hash entries for these in the form that
6317                 # utf8.c understands.
6318                 my $tostr = "";
6319                 my $to_name = "";
6320                 my $to_chr = "";
6321                 foreach my $to (split " ", $map) {
6322                     if ($to !~ /^$code_point_re$/) {
6323                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
6324                         next;
6325                     }
6326                     $tostr .= sprintf "\\x{%s}", $to;
6327                     $to = CORE::hex $to;
6328                     if ($annotate) {
6329                         $to_name .= " + " if $to_name;
6330                         $to_chr .= chr($to);
6331                         main::populate_char_info($to)
6332                                             if ! defined $viacode[$to];
6333                         $to_name .=  $viacode[$to];
6334                     }
6335                 }
6336
6337                 # I (khw) have never waded through this line to
6338                 # understand it well enough to comment it.
6339                 my $utf8 = sprintf(qq["%s" => "$tostr",],
6340                         join("", map { sprintf "\\x%02X", $_ }
6341                             unpack("U0C*", pack("U", $code_point))));
6342
6343                 # Add a comment so that a human reader can more easily
6344                 # see what's going on.
6345                 push @multi_code_point_maps,
6346                         sprintf("%-45s # U+%04X", $utf8, $code_point);
6347                 if (! $annotate) {
6348                     $multi_code_point_maps[-1] .= " => $map";
6349                 }
6350                 else {
6351                     main::populate_char_info($code_point)
6352                                     if ! defined $viacode[$code_point];
6353                     $multi_code_point_maps[-1] .= " '"
6354                         . chr($code_point)
6355                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6356                 }
6357             }
6358         }
6359         else {
6360             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
6361         }
6362
6363         return;
6364     }
6365
6366     sub pre_body {
6367         # Returns the string that should be output in the file before the main
6368         # body of this table.  It isn't called until the main body is
6369         # calculated, saving a pass.  The string includes some hash entries
6370         # identifying the format of the body, and what the single value should
6371         # be for all ranges missing from it.  It also includes any code points
6372         # which have map_types that don't go in the main table.
6373
6374         my $self = shift;
6375         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6376
6377         my $addr = do { no overloading; pack 'J', $self; };
6378
6379         my $name = $self->property->swash_name;
6380
6381         # Currently there is nothing in the pre_body unless a swash is being
6382         # generated.
6383         return unless defined $name;
6384
6385         if (defined $swash_keys{$name}) {
6386             Carp::my_carp(main::join_lines(<<END
6387 Already created a swash name '$name' for $swash_keys{$name}.  This means that
6388 the same name desired for $self shouldn't be used.  Bad News.  This must be
6389 fixed before production use, but proceeding anyway
6390 END
6391             ));
6392         }
6393         $swash_keys{$name} = "$self";
6394
6395         my $pre_body = "";
6396
6397         # Here we assume we were called after have gone through the whole
6398         # file.  If we actually generated anything for each map type, add its
6399         # respective header and trailer
6400         my $specials_name = "";
6401         if (@multi_code_point_maps) {
6402             $specials_name = "utf8::ToSpec$name";
6403             $pre_body .= <<END;
6404
6405 # Some code points require special handling because their mappings are each to
6406 # multiple code points.  These do not appear in the main body, but are defined
6407 # in the hash below.
6408
6409 # Each key is the string of N bytes that together make up the UTF-8 encoding
6410 # for the code point.  (i.e. the same as looking at the code point's UTF-8
6411 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
6412 \%$specials_name = (
6413 END
6414             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6415         }
6416
6417         my $format = $self->format;
6418
6419         my $return = "";
6420
6421         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6422         if ($output_adjusted) {
6423             if ($specials_name) {
6424                 $return .= <<END;
6425 # The mappings in the non-hash portion of this file must be modified to get the
6426 # correct values by adding the code point ordinal number to each one that is
6427 # numeric.
6428 END
6429             }
6430             else {
6431                 $return .= <<END;
6432 # The mappings must be modified to get the correct values by adding the code
6433 # point ordinal number to each one that is numeric.
6434 END
6435             }
6436         }
6437
6438         $return .= <<END;
6439
6440 # The name this swash is to be known by, with the format of the mappings in
6441 # the main body of the table, and what all code points missing from this file
6442 # map to.
6443 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6444 END
6445         if ($specials_name) {
6446             $return .= <<END;
6447 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6448 END
6449         }
6450         my $default_map = $default_map{$addr};
6451
6452         # For $CODE_POINT default maps and using adjustments, instead the default
6453         # becomes zero.
6454         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6455                 .  (($output_adjusted && $default_map eq $CODE_POINT)
6456                    ? "0"
6457                    : $default_map)
6458                 . "';";
6459
6460         if ($default_map eq $CODE_POINT) {
6461             $return .= ' # code point maps to itself';
6462         }
6463         elsif ($default_map eq "") {
6464             $return .= ' # code point maps to the null string';
6465         }
6466         $return .= "\n";
6467
6468         $return .= $pre_body;
6469
6470         return $return;
6471     }
6472
6473     sub write {
6474         # Write the table to the file.
6475
6476         my $self = shift;
6477         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6478
6479         my $addr = do { no overloading; pack 'J', $self; };
6480
6481         # Clear the temporaries
6482         undef @multi_code_point_maps;
6483
6484         # Calculate the format of the table if not already done.
6485         my $format = $self->format;
6486         my $type = $self->property->type;
6487         my $default_map = $self->default_map;
6488         if (! defined $format) {
6489             if ($type == $BINARY) {
6490
6491                 # Don't bother checking the values, because we elsewhere
6492                 # verify that a binary table has only 2 values.
6493                 $format = $BINARY_FORMAT;
6494             }
6495             else {
6496                 my @ranges = $self->_range_list->ranges;
6497
6498                 # default an empty table based on its type and default map
6499                 if (! @ranges) {
6500
6501                     # But it turns out that the only one we can say is a
6502                     # non-string (besides binary, handled above) is when the
6503                     # table is a string and the default map is to a code point
6504                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6505                         $format = $HEX_FORMAT;
6506                     }
6507                     else {
6508                         $format = $STRING_FORMAT;
6509                     }
6510                 }
6511                 else {
6512
6513                     # Start with the most restrictive format, and as we find
6514                     # something that doesn't fit with that, change to the next
6515                     # most restrictive, and so on.
6516                     $format = $DECIMAL_FORMAT;
6517                     foreach my $range (@ranges) {
6518                         next if $range->type != 0;  # Non-normal ranges don't
6519                                                     # affect the main body
6520                         my $map = $range->value;
6521                         if ($map ne $default_map) {
6522                             last if $format eq $STRING_FORMAT;  # already at
6523                                                                 # least
6524                                                                 # restrictive
6525                             $format = $INTEGER_FORMAT
6526                                                 if $format eq $DECIMAL_FORMAT
6527                                                     && $map !~ / ^ [0-9] $ /x;
6528                             $format = $FLOAT_FORMAT
6529                                             if $format eq $INTEGER_FORMAT
6530                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6531                             $format = $RATIONAL_FORMAT
6532                                 if $format eq $FLOAT_FORMAT
6533                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6534                             $format = $HEX_FORMAT
6535                                 if ($format eq $RATIONAL_FORMAT
6536                                        && $map !~
6537                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6538                                         # Assume a leading zero means hex,
6539                                         # even if all digits are 0-9
6540                                     || ($format eq $INTEGER_FORMAT
6541                                         && $map =~ /^0/);
6542                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6543                                                        && $map =~ /[^0-9A-F]/;
6544                         }
6545                     }
6546                 }
6547             }
6548         } # end of calculating format
6549
6550         if ($default_map eq $CODE_POINT
6551             && $format ne $HEX_FORMAT
6552             && ! defined $self->format)    # manual settings are always
6553                                            # considered ok
6554         {
6555             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6556         }
6557
6558         # If the output is to be adjusted, the format of the table that gets
6559         # output is actually 'a' instead of whatever it is stored internally
6560         # as.
6561         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6562         if ($output_adjusted) {
6563             $format = $ADJUST_FORMAT;
6564         }
6565
6566         $self->_set_format($format);
6567
6568         return $self->SUPER::write(
6569             $output_adjusted,
6570             ($self->property == $block)
6571                 ? 7     # block file needs more tab stops
6572                 : 3,
6573             $default_map);   # don't write defaulteds
6574     }
6575
6576     # Accessors for the underlying list that should fail if locked.
6577     for my $sub (qw(
6578                     add_duplicate
6579                 ))
6580     {
6581         no strict "refs";
6582         *$sub = sub {
6583             use strict "refs";
6584             my $self = shift;
6585
6586             return if $self->carp_if_locked;
6587             return $self->_range_list->$sub(@_);
6588         }
6589     }
6590 } # End closure for Map_Table
6591
6592 package Match_Table;
6593 use base '_Base_Table';
6594
6595 # A Match table is one which is a list of all the code points that have
6596 # the same property and property value, for use in \p{property=value}
6597 # constructs in regular expressions.  It adds very little data to the base
6598 # structure, but many methods, as these lists can be combined in many ways to
6599 # form new ones.
6600 # There are only a few concepts added:
6601 # 1) Equivalents and Relatedness.
6602 #    Two tables can match the identical code points, but have different names.
6603 #    This always happens when there is a perl single form extension
6604 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
6605 #    tables are set to be related, with the Perl extension being a child, and
6606 #    the Unicode property being the parent.
6607 #
6608 #    It may be that two tables match the identical code points and we don't
6609 #    know if they are related or not.  This happens most frequently when the
6610 #    Block and Script properties have the exact range.  But note that a
6611 #    revision to Unicode could add new code points to the script, which would
6612 #    now have to be in a different block (as the block was filled, or there
6613 #    would have been 'Unknown' script code points in it and they wouldn't have
6614 #    been identical).  So we can't rely on any two properties from Unicode
6615 #    always matching the same code points from release to release, and thus
6616 #    these tables are considered coincidentally equivalent--not related.  When
6617 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
6618 #    'leader', and the others are 'equivalents'.  This concept is useful
6619 #    to minimize the number of tables written out.  Only one file is used for
6620 #    any identical set of code points, with entries in Heavy.pl mapping all
6621 #    the involved tables to it.
6622 #
6623 #    Related tables will always be identical; we set them up to be so.  Thus
6624 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
6625 #    unrelated tables.  Relatedness makes generating the documentation easier.
6626 #
6627 # 2) Complement.
6628 #    Like equivalents, two tables may be the inverses of each other, the
6629 #    intersection between them is null, and the union is every Unicode code
6630 #    point.  The two tables that occupy a binary property are necessarily like
6631 #    this.  By specifying one table as the complement of another, we can avoid
6632 #    storing it on disk (using the other table and performing a fast
6633 #    transform), and some memory and calculations.
6634 #
6635 # 3) Conflicting.  It may be that there will eventually be name clashes, with
6636 #    the same name meaning different things.  For a while, there actually were
6637 #    conflicts, but they have so far been resolved by changing Perl's or
6638 #    Unicode's definitions to match the other, but when this code was written,
6639 #    it wasn't clear that that was what was going to happen.  (Unicode changed
6640 #    because of protests during their beta period.)  Name clashes are warned
6641 #    about during compilation, and the documentation.  The generated tables
6642 #    are sane, free of name clashes, because the code suppresses the Perl
6643 #    version.  But manual intervention to decide what the actual behavior
6644 #    should be may be required should this happen.  The introductory comments
6645 #    have more to say about this.
6646
6647 sub standardize { return main::standardize($_[0]); }
6648 sub trace { return main::trace(@_); }
6649
6650
6651 { # Closure
6652
6653     main::setup_package();
6654
6655     my %leader;
6656     # The leader table of this one; initially $self.
6657     main::set_access('leader', \%leader, 'r');
6658
6659     my %equivalents;
6660     # An array of any tables that have this one as their leader
6661     main::set_access('equivalents', \%equivalents, 'readable_array');
6662
6663     my %parent;
6664     # The parent table to this one, initially $self.  This allows us to
6665     # distinguish between equivalent tables that are related (for which this
6666     # is set to), and those which may not be, but share the same output file
6667     # because they match the exact same set of code points in the current
6668     # Unicode release.
6669     main::set_access('parent', \%parent, 'r');
6670
6671     my %children;
6672     # An array of any tables that have this one as their parent
6673     main::set_access('children', \%children, 'readable_array');
6674
6675     my %conflicting;
6676     # Array of any tables that would have the same name as this one with
6677     # a different meaning.  This is used for the generated documentation.
6678     main::set_access('conflicting', \%conflicting, 'readable_array');
6679
6680     my %matches_all;
6681     # Set in the constructor for tables that are expected to match all code
6682     # points.
6683     main::set_access('matches_all', \%matches_all, 'r');
6684
6685     my %complement;
6686     # Points to the complement that this table is expressed in terms of; 0 if
6687     # none.
6688     main::set_access('complement', \%complement, 'r');
6689
6690     sub new {
6691         my $class = shift;
6692
6693         my %args = @_;
6694
6695         # The property for which this table is a listing of property values.
6696         my $property = delete $args{'_Property'};
6697
6698         my $name = delete $args{'Name'};
6699         my $full_name = delete $args{'Full_Name'};
6700         $full_name = $name if ! defined $full_name;
6701
6702         # Optional
6703         my $initialize = delete $args{'Initialize'};
6704         my $matches_all = delete $args{'Matches_All'} || 0;
6705         my $format = delete $args{'Format'};
6706         # Rest of parameters passed on.
6707
6708         my $range_list = Range_List->new(Initialize => $initialize,
6709                                          Owner => $property);
6710
6711         my $complete = $full_name;
6712         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
6713                                               # but this helps debug if it
6714                                               # does
6715         # The complete name for a match table includes it's property in a
6716         # compound form 'property=table', except if the property is the
6717         # pseudo-property, perl, in which case it is just the single form,
6718         # 'table' (If you change the '=' must also change the ':' in lots of
6719         # places in this program that assume an equal sign)
6720         $complete = $property->full_name . "=$complete" if $property != $perl;
6721
6722         my $self = $class->SUPER::new(%args,
6723                                       Name => $name,
6724                                       Complete_Name => $complete,
6725                                       Full_Name => $full_name,
6726                                       _Property => $property,
6727                                       _Range_List => $range_list,
6728                                       Format => $EMPTY_FORMAT,
6729                                       );
6730         my $addr = do { no overloading; pack 'J', $self; };
6731
6732         $conflicting{$addr} = [ ];
6733         $equivalents{$addr} = [ ];
6734         $children{$addr} = [ ];
6735         $matches_all{$addr} = $matches_all;
6736         $leader{$addr} = $self;
6737         $parent{$addr} = $self;
6738         $complement{$addr} = 0;
6739
6740         if (defined $format && $format ne $EMPTY_FORMAT) {
6741             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
6742         }
6743
6744         return $self;
6745     }
6746
6747     # See this program's beginning comment block about overloading these.
6748     use overload
6749         fallback => 0,
6750         qw("") => "_operator_stringify",
6751         '=' => sub {
6752                     my $self = shift;
6753
6754                     return if $self->carp_if_locked;
6755                     return $self;
6756                 },
6757
6758         '+' => sub {
6759                         my $self = shift;
6760                         my $other = shift;
6761
6762                         return $self->_range_list + $other;
6763                     },
6764         '&' => sub {
6765                         my $self = shift;
6766                         my $other = shift;
6767
6768                         return $self->_range_list & $other;
6769                     },
6770         '+=' => sub {
6771                         my $self = shift;
6772                         my $other = shift;
6773
6774                         return if $self->carp_if_locked;
6775
6776                         my $addr = do { no overloading; pack 'J', $self; };
6777
6778                         if (ref $other) {
6779
6780                             # Change the range list of this table to be the
6781                             # union of the two.
6782                             $self->_set_range_list($self->_range_list
6783                                                     + $other);
6784                         }
6785                         else {    # $other is just a simple value
6786                             $self->add_range($other, $other);
6787                         }
6788                         return $self;
6789                     },
6790         '-' => sub { my $self = shift;
6791                     my $other = shift;
6792                     my $reversed = shift;
6793
6794                     if ($reversed) {
6795                         Carp::my_carp_bug("Can't cope with a "
6796                             .  __PACKAGE__
6797                             . " being the first parameter in a '-'.  Subtraction ignored.");
6798                         return;
6799                     }
6800
6801                     return $self->_range_list - $other;
6802                 },
6803         '~' => sub { my $self = shift;
6804                     return ~ $self->_range_list;
6805                 },
6806     ;
6807
6808     sub _operator_stringify {
6809         my $self = shift;
6810
6811         my $name = $self->complete_name;
6812         return "Table '$name'";
6813     }
6814
6815     sub _range_list {
6816         # Returns the range list associated with this table, which will be the
6817         # complement's if it has one.
6818
6819         my $self = shift;
6820         my $complement;
6821         if (($complement = $self->complement) != 0) {
6822             return ~ $complement->_range_list;
6823         }
6824         else {
6825             return $self->SUPER::_range_list;
6826         }
6827     }
6828
6829     sub add_alias {
6830         # Add a synonym for this table.  See the comments in the base class
6831
6832         my $self = shift;
6833         my $name = shift;
6834         # Rest of parameters passed on.
6835
6836         $self->SUPER::add_alias($name, $self, @_);
6837         return;
6838     }
6839
6840     sub add_conflicting {
6841         # Add the name of some other object to the list of ones that name
6842         # clash with this match table.
6843
6844         my $self = shift;
6845         my $conflicting_name = shift;   # The name of the conflicting object
6846         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
6847         my $conflicting_object = shift; # Optional, the conflicting object
6848                                         # itself.  This is used to
6849                                         # disambiguate the text if the input
6850                                         # name is identical to any of the
6851                                         # aliases $self is known by.
6852                                         # Sometimes the conflicting object is
6853                                         # merely hypothetical, so this has to
6854                                         # be an optional parameter.
6855         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6856
6857         my $addr = do { no overloading; pack 'J', $self; };
6858
6859         # Check if the conflicting name is exactly the same as any existing
6860         # alias in this table (as long as there is a real object there to
6861         # disambiguate with).
6862         if (defined $conflicting_object) {
6863             foreach my $alias ($self->aliases) {
6864                 if ($alias->name eq $conflicting_name) {
6865
6866                     # Here, there is an exact match.  This results in
6867                     # ambiguous comments, so disambiguate by changing the
6868                     # conflicting name to its object's complete equivalent.
6869                     $conflicting_name = $conflicting_object->complete_name;
6870                     last;
6871                 }
6872             }
6873         }
6874
6875         # Convert to the \p{...} final name
6876         $conflicting_name = "\\$p" . "{$conflicting_name}";
6877
6878         # Only add once
6879         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6880
6881         push @{$conflicting{$addr}}, $conflicting_name;
6882
6883         return;
6884     }
6885
6886     sub is_set_equivalent_to {
6887         # Return boolean of whether or not the other object is a table of this
6888         # type and has been marked equivalent to this one.
6889
6890         my $self = shift;
6891         my $other = shift;
6892         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6893
6894         return 0 if ! defined $other; # Can happen for incomplete early
6895                                       # releases
6896         unless ($other->isa(__PACKAGE__)) {
6897             my $ref_other = ref $other;
6898             my $ref_self = ref $self;
6899             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
6900             return 0;
6901         }
6902
6903         # Two tables are equivalent if they have the same leader.
6904         no overloading;
6905         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6906         return;
6907     }
6908
6909     sub set_equivalent_to {
6910         # Set $self equivalent to the parameter table.
6911         # The required Related => 'x' parameter is a boolean indicating
6912         # whether these tables are related or not.  If related, $other becomes
6913         # the 'parent' of $self; if unrelated it becomes the 'leader'
6914         #
6915         # Related tables share all characteristics except names; equivalents
6916         # not quite so many.
6917         # If they are related, one must be a perl extension.  This is because
6918         # we can't guarantee that Unicode won't change one or the other in a
6919         # later release even if they are identical now.
6920
6921         my $self = shift;
6922         my $other = shift;
6923
6924         my %args = @_;
6925         my $related = delete $args{'Related'};
6926
6927         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6928
6929         return if ! defined $other;     # Keep on going; happens in some early
6930                                         # Unicode releases.
6931
6932         if (! defined $related) {
6933             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
6934             $related = 0;
6935         }
6936
6937         # If already are equivalent, no need to re-do it;  if subroutine
6938         # returns null, it found an error, also do nothing
6939         my $are_equivalent = $self->is_set_equivalent_to($other);
6940         return if ! defined $are_equivalent || $are_equivalent;
6941
6942         my $addr = do { no overloading; pack 'J', $self; };
6943         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6944
6945         if ($related) {
6946             if ($current_leader->perl_extension) {
6947                 if ($other->perl_extension) {
6948                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6949                     return;
6950                 }
6951             } elsif ($self->property != $other->property    # Depending on
6952                                                             # situation, might
6953                                                             # be better to use
6954                                                             # add_alias()
6955                                                             # instead for same
6956                                                             # property
6957                      && ! $other->perl_extension)
6958             {
6959                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
6960                 $related = 0;
6961             }
6962         }
6963
6964         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6965             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
6966             return;
6967         }
6968
6969         my $leader = do { no overloading; pack 'J', $current_leader; };
6970         my $other_addr = do { no overloading; pack 'J', $other; };
6971
6972         # Any tables that are equivalent to or children of this table must now
6973         # instead be equivalent to or (children) to the new leader (parent),
6974         # still equivalent.  The equivalency includes their matches_all info,
6975         # and for related tables, their fate and status.
6976         # All related tables are of necessity equivalent, but the converse
6977         # isn't necessarily true
6978         my $status = $other->status;
6979         my $status_info = $other->status_info;
6980         my $fate = $other->fate;
6981         my $matches_all = $matches_all{other_addr};
6982         my $caseless_equivalent = $other->caseless_equivalent;
6983         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6984             next if $table == $other;
6985             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6986
6987             my $table_addr = do { no overloading; pack 'J', $table; };
6988             $leader{$table_addr} = $other;
6989             $matches_all{$table_addr} = $matches_all;
6990             $self->_set_range_list($other->_range_list);
6991             push @{$equivalents{$other_addr}}, $table;
6992             if ($related) {
6993                 $parent{$table_addr} = $other;
6994                 push @{$children{$other_addr}}, $table;
6995                 $table->set_status($status, $status_info);
6996
6997                 # This reason currently doesn't get exposed outside; otherwise
6998                 # would have to look up the parent's reason and use it instead.
6999                 $table->set_fate($fate, "Parent's fate");
7000
7001                 $self->set_caseless_equivalent($caseless_equivalent);
7002             }
7003         }
7004
7005         # Now that we've declared these to be equivalent, any changes to one
7006         # of the tables would invalidate that equivalency.
7007         $self->lock;
7008         $other->lock;
7009         return;
7010     }
7011
7012     sub set_complement {
7013         # Set $self to be the complement of the parameter table.  $self is
7014         # locked, as what it contains should all come from the other table.
7015
7016         my $self = shift;
7017         my $other = shift;
7018
7019         my %args = @_;
7020         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7021
7022         if ($other->complement != 0) {
7023             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7024             return;
7025         }
7026         my $addr = do { no overloading; pack 'J', $self; };
7027         $complement{$addr} = $other;
7028         $self->lock;
7029         return;
7030     }
7031
7032     sub add_range { # Add a range to the list for this table.
7033         my $self = shift;
7034         # Rest of parameters passed on
7035
7036         return if $self->carp_if_locked;
7037         return $self->_range_list->add_range(@_);
7038     }
7039
7040     sub header {
7041         my $self = shift;
7042         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7043
7044         # All match tables are to be used only by the Perl core.
7045         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7046     }
7047
7048     sub pre_body {  # Does nothing for match tables.
7049         return
7050     }
7051
7052     sub append_to_body {  # Does nothing for match tables.
7053         return
7054     }
7055
7056     sub set_fate {
7057         my $self = shift;
7058         my $fate = shift;
7059         my $reason = shift;
7060         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7061
7062         $self->SUPER::set_fate($fate, $reason);
7063
7064         # All children share this fate
7065         foreach my $child ($self->children) {
7066             $child->set_fate($fate, $reason);
7067         }
7068         return;
7069     }
7070
7071     sub write {
7072         my $self = shift;
7073         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7074
7075         return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7076     }
7077
7078     sub set_final_comment {
7079         # This creates a comment for the file that is to hold the match table
7080         # $self.  It is somewhat convoluted to make the English read nicely,
7081         # but, heh, it's just a comment.
7082         # This should be called only with the leader match table of all the
7083         # ones that share the same file.  It lists all such tables, ordered so
7084         # that related ones are together.
7085
7086         return unless $debugging_build;
7087
7088         my $leader = shift;   # Should only be called on the leader table of
7089                               # an equivalent group
7090         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7091
7092         my $addr = do { no overloading; pack 'J', $leader; };
7093
7094         if ($leader{$addr} != $leader) {
7095             Carp::my_carp_bug(<<END
7096 set_final_comment() must be called on a leader table, which $leader is not.
7097 It is equivalent to $leader{$addr}.  No comment created
7098 END
7099             );
7100             return;
7101         }
7102
7103         # Get the number of code points matched by each of the tables in this
7104         # file, and add underscores for clarity.
7105         my $count = $leader->count;
7106         my $string_count = main::clarify_number($count);
7107
7108         my $loose_count = 0;        # how many aliases loosely matched
7109         my $compound_name = "";     # ? Are any names compound?, and if so, an
7110                                     # example
7111         my $properties_with_compound_names = 0;    # count of these
7112
7113
7114         my %flags;              # The status flags used in the file
7115         my $total_entries = 0;  # number of entries written in the comment
7116         my $matches_comment = ""; # The portion of the comment about the
7117                                   # \p{}'s
7118         my @global_comments;    # List of all the tables' comments that are
7119                                 # there before this routine was called.
7120         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
7121                                 # Unicode::UCD.  If not, then don't say it is
7122                                 # in the comment
7123
7124         # Get list of all the parent tables that are equivalent to this one
7125         # (including itself).
7126         my @parents = grep { $parent{main::objaddr $_} == $_ }
7127                             main::uniques($leader, @{$equivalents{$addr}});
7128         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
7129                                               # tables
7130
7131         for my $parent (@parents) {
7132
7133             my $property = $parent->property;
7134
7135             # Special case 'N' tables in properties with two match tables when
7136             # the other is a 'Y' one.  These are likely to be binary tables,
7137             # but not necessarily.  In either case, \P{} will match the
7138             # complement of \p{}, and so if something is a synonym of \p, the
7139             # complement of that something will be the synonym of \P.  This
7140             # would be true of any property with just two match tables, not
7141             # just those whose values are Y and N; but that would require a
7142             # little extra work, and there are none such so far in Unicode.
7143             my $perl_p = 'p';        # which is it?  \p{} or \P{}
7144             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
7145
7146             if (scalar $property->tables == 2
7147                 && $parent == $property->table('N')
7148                 && defined (my $yes = $property->table('Y')))
7149             {
7150                 my $yes_addr = do { no overloading; pack 'J', $yes; };
7151                 @yes_perl_synonyms
7152                     = grep { $_->property == $perl }
7153                                     main::uniques($yes,
7154                                                 $parent{$yes_addr},
7155                                                 $parent{$yes_addr}->children);
7156
7157                 # But these synonyms are \P{} ,not \p{}
7158                 $perl_p = 'P';
7159             }
7160
7161             my @description;        # Will hold the table description
7162             my @note;               # Will hold the table notes.
7163             my @conflicting;        # Will hold the table conflicts.
7164
7165             # Look at the parent, any yes synonyms, and all the children
7166             my $parent_addr = do { no overloading; pack 'J', $parent; };
7167             for my $table ($parent,
7168                            @yes_perl_synonyms,
7169                            @{$children{$parent_addr}})
7170             {
7171                 my $table_addr = do { no overloading; pack 'J', $table; };
7172                 my $table_property = $table->property;
7173
7174                 # Tables are separated by a blank line to create a grouping.
7175                 $matches_comment .= "\n" if $matches_comment;
7176
7177                 # The table is named based on the property and value
7178                 # combination it is for, like script=greek.  But there may be
7179                 # a number of synonyms for each side, like 'sc' for 'script',
7180                 # and 'grek' for 'greek'.  Any combination of these is a valid
7181                 # name for this table.  In this case, there are three more,
7182                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
7183                 # listing all possible combinations in the comment, we make
7184                 # sure that each synonym occurs at least once, and add
7185                 # commentary that the other combinations are possible.
7186                 # Because regular expressions don't recognize things like
7187                 # \p{jsn=}, only look at non-null right-hand-sides
7188                 my @property_aliases = $table_property->aliases;
7189                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7190
7191                 # The alias lists above are already ordered in the order we
7192                 # want to output them.  To ensure that each synonym is listed,
7193                 # we must use the max of the two numbers.  But if there are no
7194                 # legal synonyms (nothing in @table_aliases), then we don't
7195                 # list anything.
7196                 my $listed_combos = (@table_aliases)
7197                                     ?  main::max(scalar @table_aliases,
7198                                                  scalar @property_aliases)
7199                                     : 0;
7200                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7201
7202
7203                 my $property_had_compound_name = 0;
7204
7205                 for my $i (0 .. $listed_combos - 1) {
7206                     $total_entries++;
7207
7208                     # The current alias for the property is the next one on
7209                     # the list, or if beyond the end, start over.  Similarly
7210                     # for the table (\p{prop=table})
7211                     my $property_alias = $property_aliases
7212                                             [$i % @property_aliases]->name;
7213                     my $table_alias_object = $table_aliases
7214                                                         [$i % @table_aliases];
7215                     my $table_alias = $table_alias_object->name;
7216                     my $loose_match = $table_alias_object->loose_match;
7217                     $has_ucd_alias |= $table_alias_object->ucd;
7218
7219                     if ($table_alias !~ /\D/) { # Clarify large numbers.
7220                         $table_alias = main::clarify_number($table_alias)
7221                     }
7222
7223                     # Add a comment for this alias combination
7224                     my $current_match_comment;
7225                     if ($table_property == $perl) {
7226                         $current_match_comment = "\\$perl_p"
7227                                                     . "{$table_alias}";
7228                     }
7229                     else {
7230                         $current_match_comment
7231                                         = "\\p{$property_alias=$table_alias}";
7232                         $property_had_compound_name = 1;
7233                     }
7234
7235                     # Flag any abnormal status for this table.
7236                     my $flag = $property->status
7237                                 || $table->status
7238                                 || $table_alias_object->status;
7239                     $flags{$flag} = $status_past_participles{$flag} if $flag;
7240
7241                     $loose_count++;
7242
7243                     # Pretty up the comment.  Note the \b; it says don't make
7244                     # this line a continuation.
7245                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7246                                         $flag,
7247                                         " " x 7,
7248                                         $current_match_comment);
7249                 } # End of generating the entries for this table.
7250
7251                 # Save these for output after this group of related tables.
7252                 push @description, $table->description;
7253                 push @note, $table->note;
7254                 push @conflicting, $table->conflicting;
7255
7256                 # And this for output after all the tables.
7257                 push @global_comments, $table->comment;
7258
7259                 # Compute an alternate compound name using the final property
7260                 # synonym and the first table synonym with a colon instead of
7261                 # the equal sign used elsewhere.
7262                 if ($property_had_compound_name) {
7263                     $properties_with_compound_names ++;
7264                     if (! $compound_name || @property_aliases > 1) {
7265                         $compound_name = $property_aliases[-1]->name
7266                                         . ': '
7267                                         . $table_aliases[0]->name;
7268                     }
7269                 }
7270             } # End of looping through all children of this table
7271
7272             # Here have assembled in $matches_comment all the related tables
7273             # to the current parent (preceded by the same info for all the
7274             # previous parents).  Put out information that applies to all of
7275             # the current family.
7276             if (@conflicting) {
7277
7278                 # But output the conflicting information now, as it applies to
7279                 # just this table.
7280                 my $conflicting = join ", ", @conflicting;
7281                 if ($conflicting) {
7282                     $matches_comment .= <<END;
7283
7284     Note that contrary to what you might expect, the above is NOT the same as
7285 END
7286                     $matches_comment .= "any of: " if @conflicting > 1;
7287                     $matches_comment .= "$conflicting\n";
7288                 }
7289             }
7290             if (@description) {
7291                 $matches_comment .= "\n    Meaning: "
7292                                     . join('; ', @description)
7293                                     . "\n";
7294             }
7295             if (@note) {
7296                 $matches_comment .= "\n    Note: "
7297                                     . join("\n    ", @note)
7298                                     . "\n";
7299             }
7300         } # End of looping through all tables
7301
7302
7303         my $code_points;
7304         my $match;
7305         my $any_of_these;
7306         if ($count == 1) {
7307             $match = 'matches';
7308             $code_points = 'single code point';
7309         }
7310         else {
7311             $match = 'match';
7312             $code_points = "$string_count code points";
7313         }
7314
7315         my $synonyms;
7316         my $entries;
7317         if ($total_entries == 1) {
7318             $synonyms = "";
7319             $entries = 'entry';
7320             $any_of_these = 'this'
7321         }
7322         else {
7323             $synonyms = " any of the following regular expression constructs";
7324             $entries = 'entries';
7325             $any_of_these = 'any of these'
7326         }
7327
7328         my $comment = "";
7329         if ($has_ucd_alias) {
7330             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7331         }
7332         if ($has_unrelated) {
7333             $comment .= <<END;
7334 This file is for tables that are not necessarily related:  To conserve
7335 resources, every table that matches the identical set of code points in this
7336 version of Unicode uses this file.  Each one is listed in a separate group
7337 below.  It could be that the tables will match the same set of code points in
7338 other Unicode releases, or it could be purely coincidence that they happen to
7339 be the same in Unicode $string_version, and hence may not in other versions.
7340
7341 END
7342         }
7343
7344         if (%flags) {
7345             foreach my $flag (sort keys %flags) {
7346                 $comment .= <<END;
7347 '$flag' below means that this form is $flags{$flag}.
7348 Consult $pod_file.pod
7349 END
7350             }
7351             $comment .= "\n";
7352         }
7353
7354         if ($total_entries == 0) {
7355             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
7356             $comment .= <<END;
7357 This file returns the $code_points in Unicode Version $string_version for
7358 $leader, but it is inaccessible through Perl regular expressions, as
7359 "\\p{prop=}" is not recognized.
7360 END
7361
7362         } else {
7363             $comment .= <<END;
7364 This file returns the $code_points in Unicode Version $string_version that
7365 $match$synonyms:
7366
7367 $matches_comment
7368 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7369 including if adding or subtracting white space, underscore, and hyphen
7370 characters matters or doesn't matter, and other permissible syntactic
7371 variants.  Upper/lower case distinctions never matter.
7372 END
7373
7374         }
7375         if ($compound_name) {
7376             $comment .= <<END;
7377
7378 A colon can be substituted for the equals sign, and
7379 END
7380             if ($properties_with_compound_names > 1) {
7381                 $comment .= <<END;
7382 within each group above,
7383 END
7384             }
7385             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7386
7387             # Note the \b below, it says don't make that line a continuation.
7388             $comment .= <<END;
7389 anything to the left of the equals (or colon) can be combined with anything to
7390 the right.  Thus, for example,
7391 $compound_name
7392 \bis also valid.
7393 END
7394         }
7395
7396         # And append any comment(s) from the actual tables.  They are all
7397         # gathered here, so may not read all that well.
7398         if (@global_comments) {
7399             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7400         }
7401
7402         if ($count) {   # The format differs if no code points, and needs no
7403                         # explanation in that case
7404                 $comment.= <<END;
7405
7406 The format of the lines of this file is:
7407 END
7408             $comment.= <<END;
7409 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7410 STOP is the ending point, or if omitted, the range has just one code point.
7411 END
7412             if ($leader->output_range_counts) {
7413                 $comment .= <<END;
7414 Numbers in comments in [brackets] indicate how many code points are in the
7415 range.
7416 END
7417             }
7418         }
7419
7420         $leader->set_comment(main::join_lines($comment));
7421         return;
7422     }
7423
7424     # Accessors for the underlying list
7425     for my $sub (qw(
7426                     get_valid_code_point
7427                     get_invalid_code_point
7428                 ))
7429     {
7430         no strict "refs";
7431         *$sub = sub {
7432             use strict "refs";
7433             my $self = shift;
7434
7435             return $self->_range_list->$sub(@_);
7436         }
7437     }
7438 } # End closure for Match_Table
7439
7440 package Property;
7441
7442 # The Property class represents a Unicode property, or the $perl
7443 # pseudo-property.  It contains a map table initialized empty at construction
7444 # time, and for properties accessible through regular expressions, various
7445 # match tables, created through the add_match_table() method, and referenced
7446 # by the table('NAME') or tables() methods, the latter returning a list of all
7447 # of the match tables.  Otherwise table operations implicitly are for the map
7448 # table.
7449 #
7450 # Most of the data in the property is actually about its map table, so it
7451 # mostly just uses that table's accessors for most methods.  The two could
7452 # have been combined into one object, but for clarity because of their
7453 # differing semantics, they have been kept separate.  It could be argued that
7454 # the 'file' and 'directory' fields should be kept with the map table.
7455 #
7456 # Each property has a type.  This can be set in the constructor, or in the
7457 # set_type accessor, but mostly it is figured out by the data.  Every property
7458 # starts with unknown type, overridden by a parameter to the constructor, or
7459 # as match tables are added, or ranges added to the map table, the data is
7460 # inspected, and the type changed.  After the table is mostly or entirely
7461 # filled, compute_type() should be called to finalize they analysis.
7462 #
7463 # There are very few operations defined.  One can safely remove a range from
7464 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7465 # table to this one, replacing any in the intersection of the two.
7466
7467 sub standardize { return main::standardize($_[0]); }
7468 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7469
7470 {   # Closure
7471
7472     # This hash will contain as keys, all the aliases of all properties, and
7473     # as values, pointers to their respective property objects.  This allows
7474     # quick look-up of a property from any of its names.
7475     my %alias_to_property_of;
7476
7477     sub dump_alias_to_property_of {
7478         # For debugging
7479
7480         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7481         return;
7482     }
7483
7484     sub property_ref {
7485         # This is a package subroutine, not called as a method.
7486         # If the single parameter is a literal '*' it returns a list of all
7487         # defined properties.
7488         # Otherwise, the single parameter is a name, and it returns a pointer
7489         # to the corresponding property object, or undef if none.
7490         #
7491         # Properties can have several different names.  The 'standard' form of
7492         # each of them is stored in %alias_to_property_of as they are defined.
7493         # But it's possible that this subroutine will be called with some
7494         # variant, so if the initial lookup fails, it is repeated with the
7495         # standardized form of the input name.  If found, besides returning the
7496         # result, the input name is added to the list so future calls won't
7497         # have to do the conversion again.
7498
7499         my $name = shift;
7500
7501         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7502
7503         if (! defined $name) {
7504             Carp::my_carp_bug("Undefined input property.  No action taken.");
7505             return;
7506         }
7507
7508         return main::uniques(values %alias_to_property_of) if $name eq '*';
7509
7510         # Return cached result if have it.
7511         my $result = $alias_to_property_of{$name};
7512         return $result if defined $result;
7513
7514         # Convert the input to standard form.
7515         my $standard_name = standardize($name);
7516
7517         $result = $alias_to_property_of{$standard_name};
7518         return unless defined $result;        # Don't cache undefs
7519
7520         # Cache the result before returning it.
7521         $alias_to_property_of{$name} = $result;
7522         return $result;
7523     }
7524
7525
7526     main::setup_package();
7527
7528     my %map;
7529     # A pointer to the map table object for this property
7530     main::set_access('map', \%map);
7531
7532     my %full_name;
7533     # The property's full name.  This is a duplicate of the copy kept in the
7534     # map table, but is needed because stringify needs it during
7535     # construction of the map table, and then would have a chicken before egg
7536     # problem.
7537     main::set_access('full_name', \%full_name, 'r');
7538
7539     my %table_ref;
7540     # This hash will contain as keys, all the aliases of any match tables
7541     # attached to this property, and as values, the pointers to their
7542     # respective tables.  This allows quick look-up of a table from any of its
7543     # names.
7544     main::set_access('table_ref', \%table_ref);
7545
7546     my %type;
7547     # The type of the property, $ENUM, $BINARY, etc
7548     main::set_access('type', \%type, 'r');
7549
7550     my %file;
7551     # The filename where the map table will go (if actually written).
7552     # Normally defaulted, but can be overridden.
7553     main::set_access('file', \%file, 'r', 's');
7554
7555     my %directory;
7556     # The directory where the map table will go (if actually written).
7557     # Normally defaulted, but can be overridden.
7558     main::set_access('directory', \%directory, 's');
7559
7560     my %pseudo_map_type;
7561     # This is used to affect the calculation of the map types for all the
7562     # ranges in the table.  It should be set to one of the values that signify
7563     # to alter the calculation.
7564     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7565
7566     my %has_only_code_point_maps;
7567     # A boolean used to help in computing the type of data in the map table.
7568     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7569
7570     my %unique_maps;
7571     # A list of the first few distinct mappings this property has.  This is
7572     # used to disambiguate between binary and enum property types, so don't
7573     # have to keep more than three.
7574     main::set_access('unique_maps', \%unique_maps);
7575
7576     my %pre_declared_maps;
7577     # A boolean that gives whether the input data should declare all the
7578     # tables used, or not.  If the former, unknown ones raise a warning.
7579     main::set_access('pre_declared_maps',
7580                                     \%pre_declared_maps, 'r', 's');
7581
7582     sub new {
7583         # The only required parameter is the positionally first, name.  All
7584         # other parameters are key => value pairs.  See the documentation just
7585         # above for the meanings of the ones not passed directly on to the map
7586         # table constructor.
7587
7588         my $class = shift;
7589         my $name = shift || "";
7590
7591         my $self = property_ref($name);
7592         if (defined $self) {
7593             my $options_string = join ", ", @_;
7594             $options_string = ".  Ignoring options $options_string" if $options_string;
7595             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
7596             return $self;
7597         }
7598
7599         my %args = @_;
7600
7601         $self = bless \do { my $anonymous_scalar }, $class;
7602         my $addr = do { no overloading; pack 'J', $self; };
7603
7604         $directory{$addr} = delete $args{'Directory'};
7605         $file{$addr} = delete $args{'File'};
7606         $full_name{$addr} = delete $args{'Full_Name'} || $name;
7607         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7608         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7609         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7610                                     # Starting in this release, property
7611                                     # values should be defined for all
7612                                     # properties, except those overriding this
7613                                     // $v_version ge v5.1.0;
7614
7615         # Rest of parameters passed on.
7616
7617         $has_only_code_point_maps{$addr} = 1;
7618         $table_ref{$addr} = { };
7619         $unique_maps{$addr} = { };
7620
7621         $map{$addr} = Map_Table->new($name,
7622                                     Full_Name => $full_name{$addr},
7623                                     _Alias_Hash => \%alias_to_property_of,
7624                                     _Property => $self,
7625                                     %args);
7626         return $self;
7627     }
7628
7629     # See this program's beginning comment block about overloading the copy
7630     # constructor.  Few operations are defined on properties, but a couple are
7631     # useful.  It is safe to take the inverse of a property, and to remove a
7632     # single code point from it.
7633     use overload
7634         fallback => 0,
7635         qw("") => "_operator_stringify",
7636         "." => \&main::_operator_dot,
7637         '==' => \&main::_operator_equal,
7638         '!=' => \&main::_operator_not_equal,
7639         '=' => sub { return shift },
7640         '-=' => "_minus_and_equal",
7641     ;
7642
7643     sub _operator_stringify {
7644         return "Property '" .  shift->full_name . "'";
7645     }
7646
7647     sub _minus_and_equal {
7648         # Remove a single code point from the map table of a property.
7649
7650         my $self = shift;
7651         my $other = shift;
7652         my $reversed = shift;
7653         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7654
7655         if (ref $other) {
7656             Carp::my_carp_bug("Can't cope with a "
7657                         . ref($other)
7658                         . " argument to '-='.  Subtraction ignored.");
7659             return $self;
7660         }
7661         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
7662             Carp::my_carp_bug("Can't cope with a "
7663             .  __PACKAGE__
7664             . " being the first parameter in a '-='.  Subtraction ignored.");
7665             return $self;
7666         }
7667         else {
7668             no overloading;
7669             $map{pack 'J', $self}->delete_range($other, $other);
7670         }
7671         return $self;
7672     }
7673
7674     sub add_match_table {
7675         # Add a new match table for this property, with name given by the
7676         # parameter.  It returns a pointer to the table.
7677
7678         my $self = shift;
7679         my $name = shift;
7680         my %args = @_;
7681
7682         my $addr = do { no overloading; pack 'J', $self; };
7683
7684         my $table = $table_ref{$addr}{$name};
7685         my $standard_name = main::standardize($name);
7686         if (defined $table
7687             || (defined ($table = $table_ref{$addr}{$standard_name})))
7688         {
7689             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
7690             $table_ref{$addr}{$name} = $table;
7691             return $table;
7692         }
7693         else {
7694
7695             # See if this is a perl extension, if not passed in.
7696             my $perl_extension = delete $args{'Perl_Extension'};
7697             $perl_extension
7698                         = $self->perl_extension if ! defined $perl_extension;
7699
7700             $table = Match_Table->new(
7701                                 Name => $name,
7702                                 Perl_Extension => $perl_extension,
7703                                 _Alias_Hash => $table_ref{$addr},
7704                                 _Property => $self,
7705
7706                                 # gets property's fate and status by default
7707                                 Fate => $self->fate,
7708                                 Status => $self->status,
7709                                 _Status_Info => $self->status_info,
7710                                 %args);
7711             return unless defined $table;
7712         }
7713
7714         # Save the names for quick look up
7715         $table_ref{$addr}{$standard_name} = $table;
7716         $table_ref{$addr}{$name} = $table;
7717
7718         # Perhaps we can figure out the type of this property based on the
7719         # fact of adding this match table.  First, string properties don't
7720         # have match tables; second, a binary property can't have 3 match
7721         # tables
7722         if ($type{$addr} == $UNKNOWN) {
7723             $type{$addr} = $NON_STRING;
7724         }
7725         elsif ($type{$addr} == $STRING) {
7726             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
7727             $type{$addr} = $NON_STRING;
7728         }
7729         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7730             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7731                 && $type{$addr} == $BINARY)
7732             {
7733                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
7734                 $type{$addr} = $ENUM;
7735             }
7736         }
7737
7738         return $table;
7739     }
7740
7741     sub delete_match_table {
7742         # Delete the table referred to by $2 from the property $1.
7743
7744         my $self = shift;
7745         my $table_to_remove = shift;
7746         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7747
7748         my $addr = do { no overloading; pack 'J', $self; };
7749
7750         # Remove all names that refer to it.
7751         foreach my $key (keys %{$table_ref{$addr}}) {
7752             delete $table_ref{$addr}{$key}
7753                                 if $table_ref{$addr}{$key} == $table_to_remove;
7754         }
7755
7756         $table_to_remove->DESTROY;
7757         return;
7758     }
7759
7760     sub table {
7761         # Return a pointer to the match table (with name given by the
7762         # parameter) associated with this property; undef if none.
7763
7764         my $self = shift;
7765         my $name = shift;
7766         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7767
7768         my $addr = do { no overloading; pack 'J', $self; };
7769
7770         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7771
7772         # If quick look-up failed, try again using the standard form of the
7773         # input name.  If that succeeds, cache the result before returning so
7774         # won't have to standardize this input name again.
7775         my $standard_name = main::standardize($name);
7776         return unless defined $table_ref{$addr}{$standard_name};
7777
7778         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7779         return $table_ref{$addr}{$name};
7780     }
7781
7782     sub tables {
7783         # Return a list of pointers to all the match tables attached to this
7784         # property
7785
7786         no overloading;
7787         return main::uniques(values %{$table_ref{pack 'J', shift}});
7788     }
7789
7790     sub directory {
7791         # Returns the directory the map table for this property should be
7792         # output in.  If a specific directory has been specified, that has
7793         # priority;  'undef' is returned if the type isn't defined;
7794         # or $map_directory for everything else.
7795
7796         my $addr = do { no overloading; pack 'J', shift; };
7797
7798         return $directory{$addr} if defined $directory{$addr};
7799         return undef if $type{$addr} == $UNKNOWN;
7800         return $map_directory;
7801     }
7802
7803     sub swash_name {
7804         # Return the name that is used to both:
7805         #   1)  Name the file that the map table is written to.
7806         #   2)  The name of swash related stuff inside that file.
7807         # The reason for this is that the Perl core historically has used
7808         # certain names that aren't the same as the Unicode property names.
7809         # To continue using these, $file is hard-coded in this file for those,
7810         # but otherwise the standard name is used.  This is different from the
7811         # external_name, so that the rest of the files, like in lib can use
7812         # the standard name always, without regard to historical precedent.
7813
7814         my $self = shift;
7815         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7816
7817         my $addr = do { no overloading; pack 'J', $self; };
7818
7819         # Swash names are used only on regular map tables; otherwise there
7820         # should be no access to the property map table from other parts of
7821         # Perl.
7822         return if $map{$addr}->fate != $ORDINARY;
7823
7824         return $file{$addr} if defined $file{$addr};
7825         return $map{$addr}->external_name;
7826     }
7827
7828     sub to_create_match_tables {
7829         # Returns a boolean as to whether or not match tables should be
7830         # created for this property.
7831
7832         my $self = shift;
7833         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7834
7835         # The whole point of this pseudo property is match tables.
7836         return 1 if $self == $perl;
7837
7838         my $addr = do { no overloading; pack 'J', $self; };
7839
7840         # Don't generate tables of code points that match the property values
7841         # of a string property.  Such a list would most likely have many
7842         # property values, each with just one or very few code points mapping
7843         # to it.
7844         return 0 if $type{$addr} == $STRING;
7845
7846         # Don't generate anything for unimplemented properties.
7847         return 0 if grep { $self->complete_name eq $_ }
7848                                                     @unimplemented_properties;
7849         # Otherwise, do.
7850         return 1;
7851     }
7852
7853     sub property_add_or_replace_non_nulls {
7854         # This adds the mappings in the property $other to $self.  Non-null
7855         # mappings from $other override those in $self.  It essentially merges
7856         # the two properties, with the second having priority except for null
7857         # mappings.
7858
7859         my $self = shift;
7860         my $other = shift;
7861         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7862
7863         if (! $other->isa(__PACKAGE__)) {
7864             Carp::my_carp_bug("$other should be a "
7865                             . __PACKAGE__
7866                             . ".  Not a '"
7867                             . ref($other)
7868                             . "'.  Not added;");
7869             return;
7870         }
7871
7872         no overloading;
7873         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7874     }
7875
7876     sub set_proxy_for {
7877         # Certain tables are not generally written out to files, but
7878         # Unicode::UCD has the intelligence to know that the file for $self
7879         # can be used to reconstruct those tables.  This routine just changes
7880         # things so that UCD pod entries for those suppressed tables are
7881         # generated, so the fact that a proxy is used is invisible to the
7882         # user.
7883
7884         my $self = shift;
7885
7886         foreach my $property_name (@_) {
7887             my $ref = property_ref($property_name);
7888             next if $ref->to_output_map;
7889             $ref->set_fate($MAP_PROXIED);
7890         }
7891     }
7892
7893     sub set_type {
7894         # Set the type of the property.  Mostly this is figured out by the
7895         # data in the table.  But this is used to set it explicitly.  The
7896         # reason it is not a standard accessor is that when setting a binary
7897         # property, we need to make sure that all the true/false aliases are
7898         # present, as they were omitted in early Unicode releases.
7899
7900         my $self = shift;
7901         my $type = shift;
7902         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7903
7904         if ($type != $ENUM
7905             && $type != $BINARY
7906             && $type != $FORCED_BINARY
7907             && $type != $STRING)
7908         {
7909             Carp::my_carp("Unrecognized type '$type'.  Type not set");
7910             return;
7911         }
7912
7913         { no overloading; $type{pack 'J', $self} = $type; }
7914         return if $type != $BINARY && $type != $FORCED_BINARY;
7915
7916         my $yes = $self->table('Y');
7917         $yes = $self->table('Yes') if ! defined $yes;
7918         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7919                                                             if ! defined $yes;
7920
7921         # Add aliases in order wanted, duplicates will be ignored.  We use a
7922         # binary property present in all releases for its ordered lists of
7923         # true/false aliases.  Note, that could run into problems in
7924         # outputting things in that we don't distinguish between the name and
7925         # full name of these.  Hopefully, if the table was already created
7926         # before this code is executed, it was done with these set properly.
7927         my $bm = property_ref("Bidi_Mirrored");
7928         foreach my $alias ($bm->table("Y")->aliases) {
7929             $yes->add_alias($alias->name);
7930         }
7931         my $no = $self->table('N');
7932         $no = $self->table('No') if ! defined $no;
7933         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
7934         foreach my $alias ($bm->table("N")->aliases) {
7935             $no->add_alias($alias->name);
7936         }
7937
7938         return;
7939     }
7940
7941     sub add_map {
7942         # Add a map to the property's map table.  This also keeps
7943         # track of the maps so that the property type can be determined from
7944         # its data.
7945
7946         my $self = shift;
7947         my $start = shift;  # First code point in range
7948         my $end = shift;    # Final code point in range
7949         my $map = shift;    # What the range maps to.
7950         # Rest of parameters passed on.
7951
7952         my $addr = do { no overloading; pack 'J', $self; };
7953
7954         # If haven't the type of the property, gather information to figure it
7955         # out.
7956         if ($type{$addr} == $UNKNOWN) {
7957
7958             # If the map contains an interior blank or dash, or most other
7959             # nonword characters, it will be a string property.  This
7960             # heuristic may actually miss some string properties.  If so, they
7961             # may need to have explicit set_types called for them.  This
7962             # happens in the Unihan properties.
7963             if ($map =~ / (?<= . ) [ -] (?= . ) /x
7964                 || $map =~ / [^\w.\/\ -]  /x)
7965             {
7966                 $self->set_type($STRING);
7967
7968                 # $unique_maps is used for disambiguating between ENUM and
7969                 # BINARY later; since we know the property is not going to be
7970                 # one of those, no point in keeping the data around
7971                 undef $unique_maps{$addr};
7972             }
7973             else {
7974
7975                 # Not necessarily a string.  The final decision has to be
7976                 # deferred until all the data are in.  We keep track of if all
7977                 # the values are code points for that eventual decision.
7978                 $has_only_code_point_maps{$addr} &=
7979                                             $map =~ / ^ $code_point_re $/x;
7980
7981                 # For the purposes of disambiguating between binary and other
7982                 # enumerations at the end, we keep track of the first three
7983                 # distinct property values.  Once we get to three, we know
7984                 # it's not going to be binary, so no need to track more.
7985                 if (scalar keys %{$unique_maps{$addr}} < 3) {
7986                     $unique_maps{$addr}{main::standardize($map)} = 1;
7987                 }
7988             }
7989         }
7990
7991         # Add the mapping by calling our map table's method
7992         return $map{$addr}->add_map($start, $end, $map, @_);
7993     }
7994
7995     sub compute_type {
7996         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
7997         # should be called after the property is mostly filled with its maps.
7998         # We have been keeping track of what the property values have been,
7999         # and now have the necessary information to figure out the type.
8000
8001         my $self = shift;
8002         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8003
8004         my $addr = do { no overloading; pack 'J', $self; };
8005
8006         my $type = $type{$addr};
8007
8008         # If already have figured these out, no need to do so again, but we do
8009         # a double check on ENUMS to make sure that a string property hasn't
8010         # improperly been classified as an ENUM, so continue on with those.
8011         return if $type == $STRING
8012                   || $type == $BINARY
8013                   || $type == $FORCED_BINARY;
8014
8015         # If every map is to a code point, is a string property.
8016         if ($type == $UNKNOWN
8017             && ($has_only_code_point_maps{$addr}
8018                 || (defined $map{$addr}->default_map
8019                     && $map{$addr}->default_map eq "")))
8020         {
8021             $self->set_type($STRING);
8022         }
8023         else {
8024
8025             # Otherwise, it is to some sort of enumeration.  (The case where
8026             # it is a Unicode miscellaneous property, and treated like a
8027             # string in this program is handled in add_map()).  Distinguish
8028             # between binary and some other enumeration type.  Of course, if
8029             # there are more than two values, it's not binary.  But more
8030             # subtle is the test that the default mapping is defined means it
8031             # isn't binary.  This in fact may change in the future if Unicode
8032             # changes the way its data is structured.  But so far, no binary
8033             # properties ever have @missing lines for them, so the default map
8034             # isn't defined for them.  The few properties that are two-valued
8035             # and aren't considered binary have the default map defined
8036             # starting in Unicode 5.0, when the @missing lines appeared; and
8037             # this program has special code to put in a default map for them
8038             # for earlier than 5.0 releases.
8039             if ($type == $ENUM
8040                 || scalar keys %{$unique_maps{$addr}} > 2
8041                 || defined $self->default_map)
8042             {
8043                 my $tables = $self->tables;
8044                 my $count = $self->count;
8045                 if ($verbosity && $count > 500 && $tables/$count > .1) {
8046                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
8047                 }
8048                 $self->set_type($ENUM);
8049             }
8050             else {
8051                 $self->set_type($BINARY);
8052             }
8053         }
8054         undef $unique_maps{$addr};  # Garbage collect
8055         return;
8056     }
8057
8058     sub set_fate {
8059         my $self = shift;
8060         my $fate = shift;
8061         my $reason = shift;  # Ignored unless suppressing
8062         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8063
8064         my $addr = do { no overloading; pack 'J', $self; };
8065         if ($fate == $SUPPRESSED) {
8066             $why_suppressed{$self->complete_name} = $reason;
8067         }
8068
8069         # Each table shares the property's fate, except that MAP_PROXIED
8070         # doesn't affect match tables
8071         $map{$addr}->set_fate($fate, $reason);
8072         if ($fate != $MAP_PROXIED) {
8073             foreach my $table ($map{$addr}, $self->tables) {
8074                 $table->set_fate($fate, $reason);
8075             }
8076         }
8077         return;
8078     }
8079
8080
8081     # Most of the accessors for a property actually apply to its map table.
8082     # Setup up accessor functions for those, referring to %map
8083     for my $sub (qw(
8084                     add_alias
8085                     add_anomalous_entry
8086                     add_comment
8087                     add_conflicting
8088                     add_description
8089                     add_duplicate
8090                     add_note
8091                     aliases
8092                     comment
8093                     complete_name
8094                     containing_range
8095                     count
8096                     default_map
8097                     delete_range
8098                     description
8099                     each_range
8100                     external_name
8101                     fate
8102                     file_path
8103                     format
8104                     initialize
8105                     inverse_list
8106                     is_empty
8107                     name
8108                     note
8109                     perl_extension
8110                     property
8111                     range_count
8112                     ranges
8113                     range_size_1
8114                     reset_each_range
8115                     set_comment
8116                     set_default_map
8117                     set_file_path
8118                     set_final_comment
8119                     _set_format
8120                     set_range_size_1
8121                     set_status
8122                     set_to_output_map
8123                     short_name
8124                     status
8125                     status_info
8126                     to_output_map
8127                     type_of
8128                     value_of
8129                     write
8130                 ))
8131                     # 'property' above is for symmetry, so that one can take
8132                     # the property of a property and get itself, and so don't
8133                     # have to distinguish between properties and tables in
8134                     # calling code
8135     {
8136         no strict "refs";
8137         *$sub = sub {
8138             use strict "refs";
8139             my $self = shift;
8140             no overloading;
8141             return $map{pack 'J', $self}->$sub(@_);
8142         }
8143     }
8144
8145
8146 } # End closure
8147
8148 package main;
8149
8150 sub join_lines($) {
8151     # Returns lines of the input joined together, so that they can be folded
8152     # properly.
8153     # This causes continuation lines to be joined together into one long line
8154     # for folding.  A continuation line is any line that doesn't begin with a
8155     # space or "\b" (the latter is stripped from the output).  This is so
8156     # lines can be be in a HERE document so as to fit nicely in the terminal
8157     # width, but be joined together in one long line, and then folded with
8158     # indents, '#' prefixes, etc, properly handled.
8159     # A blank separates the joined lines except if there is a break; an extra
8160     # blank is inserted after a period ending a line.
8161
8162     # Initialize the return with the first line.
8163     my ($return, @lines) = split "\n", shift;
8164
8165     # If the first line is null, it was an empty line, add the \n back in
8166     $return = "\n" if $return eq "";
8167
8168     # Now join the remainder of the physical lines.
8169     for my $line (@lines) {
8170
8171         # An empty line means wanted a blank line, so add two \n's to get that
8172         # effect, and go to the next line.
8173         if (length $line == 0) {
8174             $return .= "\n\n";
8175             next;
8176         }
8177
8178         # Look at the last character of what we have so far.
8179         my $previous_char = substr($return, -1, 1);
8180
8181         # And at the next char to be output.
8182         my $next_char = substr($line, 0, 1);
8183
8184         if ($previous_char ne "\n") {
8185
8186             # Here didn't end wth a nl.  If the next char a blank or \b, it
8187             # means that here there is a break anyway.  So add a nl to the
8188             # output.
8189             if ($next_char eq " " || $next_char eq "\b") {
8190                 $previous_char = "\n";
8191                 $return .= $previous_char;
8192             }
8193
8194             # Add an extra space after periods.
8195             $return .= " " if $previous_char eq '.';
8196         }
8197
8198         # Here $previous_char is still the latest character to be output.  If
8199         # it isn't a nl, it means that the next line is to be a continuation
8200         # line, with a blank inserted between them.
8201         $return .= " " if $previous_char ne "\n";
8202
8203         # Get rid of any \b
8204         substr($line, 0, 1) = "" if $next_char eq "\b";
8205
8206         # And append this next line.
8207         $return .= $line;
8208     }
8209
8210     return $return;
8211 }
8212
8213 sub simple_fold($;$$$) {
8214     # Returns a string of the input (string or an array of strings) folded
8215     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8216     # a \n
8217     # This is tailored for the kind of text written by this program,
8218     # especially the pod file, which can have very long names with
8219     # underscores in the middle, or words like AbcDefgHij....  We allow
8220     # breaking in the middle of such constructs if the line won't fit
8221     # otherwise.  The break in such cases will come either just after an
8222     # underscore, or just before one of the Capital letters.
8223
8224     local $to_trace = 0 if main::DEBUG;
8225
8226     my $line = shift;
8227     my $prefix = shift;     # Optional string to prepend to each output
8228                             # line
8229     $prefix = "" unless defined $prefix;
8230
8231     my $hanging_indent = shift; # Optional number of spaces to indent
8232                                 # continuation lines
8233     $hanging_indent = 0 unless $hanging_indent;
8234
8235     my $right_margin = shift;   # Optional number of spaces to narrow the
8236                                 # total width by.
8237     $right_margin = 0 unless defined $right_margin;
8238
8239     # Call carp with the 'nofold' option to avoid it from trying to call us
8240     # recursively
8241     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8242
8243     # The space available doesn't include what's automatically prepended
8244     # to each line, or what's reserved on the right.
8245     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8246     # XXX Instead of using the 'nofold' perhaps better to look up the stack
8247
8248     if (DEBUG && $hanging_indent >= $max) {
8249         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
8250         $hanging_indent = 0;
8251     }
8252
8253     # First, split into the current physical lines.
8254     my @line;
8255     if (ref $line) {        # Better be an array, because not bothering to
8256                             # test
8257         foreach my $line (@{$line}) {
8258             push @line, split /\n/, $line;
8259         }
8260     }
8261     else {
8262         @line = split /\n/, $line;
8263     }
8264
8265     #local $to_trace = 1 if main::DEBUG;
8266     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8267
8268     # Look at each current physical line.
8269     for (my $i = 0; $i < @line; $i++) {
8270         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8271         #local $to_trace = 1 if main::DEBUG;
8272         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8273
8274         # Remove prefix, because will be added back anyway, don't want
8275         # doubled prefix
8276         $line[$i] =~ s/^$prefix//;
8277
8278         # Remove trailing space
8279         $line[$i] =~ s/\s+\Z//;
8280
8281         # If the line is too long, fold it.
8282         if (length $line[$i] > $max) {
8283             my $remainder;
8284
8285             # Here needs to fold.  Save the leading space in the line for
8286             # later.
8287             $line[$i] =~ /^ ( \s* )/x;
8288             my $leading_space = $1;
8289             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8290
8291             # If character at final permissible position is white space,
8292             # fold there, which will delete that white space
8293             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8294                 $remainder = substr($line[$i], $max);
8295                 $line[$i] = substr($line[$i], 0, $max - 1);
8296             }
8297             else {
8298
8299                 # Otherwise fold at an acceptable break char closest to
8300                 # the max length.  Look at just the maximal initial
8301                 # segment of the line
8302                 my $segment = substr($line[$i], 0, $max - 1);
8303                 if ($segment =~
8304                     /^ ( .{$hanging_indent}   # Don't look before the
8305                                               #  indent.
8306                         \ *                   # Don't look in leading
8307                                               #  blanks past the indent
8308                             [^ ] .*           # Find the right-most
8309                         (?:                   #  acceptable break:
8310                             [ \s = ]          # space or equal
8311                             | - (?! [.0-9] )  # or non-unary minus.
8312                         )                     # $1 includes the character
8313                     )/x)
8314                 {
8315                     # Split into the initial part that fits, and remaining
8316                     # part of the input
8317                     $remainder = substr($line[$i], length $1);
8318                     $line[$i] = $1;
8319                     trace $line[$i] if DEBUG && $to_trace;
8320                     trace $remainder if DEBUG && $to_trace;
8321                 }
8322
8323                 # If didn't find a good breaking spot, see if there is a
8324                 # not-so-good breaking spot.  These are just after
8325                 # underscores or where the case changes from lower to
8326                 # upper.  Use \a as a soft hyphen, but give up
8327                 # and don't break the line if there is actually a \a
8328                 # already in the input.  We use an ascii character for the
8329                 # soft-hyphen to avoid any attempt by miniperl to try to
8330                 # access the files that this program is creating.
8331                 elsif ($segment !~ /\a/
8332                        && ($segment =~ s/_/_\a/g
8333                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8334                 {
8335                     # Here were able to find at least one place to insert
8336                     # our substitute soft hyphen.  Find the right-most one
8337                     # and replace it by a real hyphen.
8338                     trace $segment if DEBUG && $to_trace;
8339                     substr($segment,
8340                             rindex($segment, "\a"),
8341                             1) = '-';
8342
8343                     # Then remove the soft hyphen substitutes.
8344                     $segment =~ s/\a//g;
8345                     trace $segment if DEBUG && $to_trace;
8346
8347                     # And split into the initial part that fits, and
8348                     # remainder of the line
8349                     my $pos = rindex($segment, '-');
8350                     $remainder = substr($line[$i], $pos);
8351                     trace $remainder if DEBUG && $to_trace;
8352                     $line[$i] = substr($segment, 0, $pos + 1);
8353                 }
8354             }
8355
8356             # Here we know if we can fold or not.  If we can, $remainder
8357             # is what remains to be processed in the next iteration.
8358             if (defined $remainder) {
8359                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8360
8361                 # Insert the folded remainder of the line as a new element
8362                 # of the array.  (It may still be too long, but we will
8363                 # deal with that next time through the loop.)  Omit any
8364                 # leading space in the remainder.
8365                 $remainder =~ s/^\s+//;
8366                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8367
8368                 # But then indent by whichever is larger of:
8369                 # 1) the leading space on the input line;
8370                 # 2) the hanging indent.
8371                 # This preserves indentation in the original line.
8372                 my $lead = ($leading_space)
8373                             ? length $leading_space
8374                             : $hanging_indent;
8375                 $lead = max($lead, $hanging_indent);
8376                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8377             }
8378         }
8379
8380         # Ready to output the line. Get rid of any trailing space
8381         # And prefix by the required $prefix passed in.
8382         $line[$i] =~ s/\s+$//;
8383         $line[$i] = "$prefix$line[$i]\n";
8384     } # End of looping through all the lines.
8385
8386     return join "", @line;
8387 }
8388
8389 sub property_ref {  # Returns a reference to a property object.
8390     return Property::property_ref(@_);
8391 }
8392
8393 sub force_unlink ($) {
8394     my $filename = shift;
8395     return unless file_exists($filename);
8396     return if CORE::unlink($filename);
8397
8398     # We might need write permission
8399     chmod 0777, $filename;
8400     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
8401     return;
8402 }
8403
8404 sub write ($$@) {
8405     # Given a filename and references to arrays of lines, write the lines of
8406     # each array to the file
8407     # Filename can be given as an arrayref of directory names
8408
8409     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8410
8411     my $file  = shift;
8412     my $use_utf8 = shift;
8413
8414     # Get into a single string if an array, and get rid of, in Unix terms, any
8415     # leading '.'
8416     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8417     $file = File::Spec->canonpath($file);
8418
8419     # If has directories, make sure that they all exist
8420     (undef, my $directories, undef) = File::Spec->splitpath($file);
8421     File::Path::mkpath($directories) if $directories && ! -d $directories;
8422
8423     push @files_actually_output, $file;
8424
8425     force_unlink ($file);
8426
8427     my $OUT;
8428     if (not open $OUT, ">", $file) {
8429         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
8430         return;
8431     }
8432
8433     binmode $OUT, ":utf8" if $use_utf8;
8434
8435     while (defined (my $lines_ref = shift)) {
8436         unless (@$lines_ref) {
8437             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8438         }
8439
8440         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8441     }
8442     close $OUT or die Carp::my_carp("close '$file' failed: $!");
8443
8444     print "$file written.\n" if $verbosity >= $VERBOSE;
8445
8446     return;
8447 }
8448
8449
8450 sub Standardize($) {
8451     # This converts the input name string into a standardized equivalent to
8452     # use internally.
8453
8454     my $name = shift;
8455     unless (defined $name) {
8456       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
8457       return;
8458     }
8459
8460     # Remove any leading or trailing white space
8461     $name =~ s/^\s+//g;
8462     $name =~ s/\s+$//g;
8463
8464     # Convert interior white space and hyphens into underscores.
8465     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8466
8467     # Capitalize the letter following an underscore, and convert a sequence of
8468     # multiple underscores to a single one
8469     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8470
8471     # And capitalize the first letter, but not for the special cjk ones.
8472     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8473     return $name;
8474 }
8475
8476 sub standardize ($) {
8477     # Returns a lower-cased standardized name, without underscores.  This form
8478     # is chosen so that it can distinguish between any real versus superficial
8479     # Unicode name differences.  It relies on the fact that Unicode doesn't
8480     # have interior underscores, white space, nor dashes in any
8481     # stricter-matched name.  It should not be used on Unicode code point
8482     # names (the Name property), as they mostly, but not always follow these
8483     # rules.
8484
8485     my $name = Standardize(shift);
8486     return if !defined $name;
8487
8488     $name =~ s/ (?<= .) _ (?= . ) //xg;
8489     return lc $name;
8490 }
8491
8492 sub utf8_heavy_name ($$) {
8493     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
8494     # perhaps this function should be placed somewhere, like Heavy.pl so that
8495     # utf8_heavy can use it directly without duplicating code that can get
8496     # out-of sync.
8497
8498     my $table = shift;
8499     my $alias = shift;
8500     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8501
8502     my $property = $table->property;
8503     $property = ($property == $perl)
8504                 ? ""                # 'perl' is never explicitly stated
8505                 : standardize($property->name) . '=';
8506     if ($alias->loose_match) {
8507         return $property . standardize($alias->name);
8508     }
8509     else {
8510         return lc ($property . $alias->name);
8511     }
8512
8513     return;
8514 }
8515
8516 {   # Closure
8517
8518     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8519     my %already_output;
8520
8521     $main::simple_dumper_nesting = 0;
8522
8523     sub simple_dumper {
8524         # Like Simple Data::Dumper. Good enough for our needs. We can't use
8525         # the real thing as we have to run under miniperl.
8526
8527         # It is designed so that on input it is at the beginning of a line,
8528         # and the final thing output in any call is a trailing ",\n".
8529
8530         my $item = shift;
8531         my $indent = shift;
8532         $indent = "" if ! $debugging_build || ! defined $indent;
8533
8534         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8535
8536         # nesting level is localized, so that as the call stack pops, it goes
8537         # back to the prior value.
8538         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8539         undef %already_output if $main::simple_dumper_nesting == 0;
8540         $main::simple_dumper_nesting++;
8541         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8542
8543         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8544
8545         # Determine the indent for recursive calls.
8546         my $next_indent = $indent . $indent_increment;
8547
8548         my $output;
8549         if (! ref $item) {
8550
8551             # Dump of scalar: just output it in quotes if not a number.  To do
8552             # so we must escape certain characters, and therefore need to
8553             # operate on a copy to avoid changing the original
8554             my $copy = $item;
8555             $copy = $UNDEF unless defined $copy;
8556
8557             # Quote non-integers (integers also have optional leading '-')
8558             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8559
8560                 # Escape apostrophe and backslash
8561                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8562                 $copy = "'$copy'";
8563             }
8564             $output = "$indent$copy,\n";
8565         }
8566         else {
8567
8568             # Keep track of cycles in the input, and refuse to infinitely loop
8569             my $addr = do { no overloading; pack 'J', $item; };
8570             if (defined $already_output{$addr}) {
8571                 return "${indent}ALREADY OUTPUT: $item\n";
8572             }
8573             $already_output{$addr} = $item;
8574
8575             if (ref $item eq 'ARRAY') {
8576                 my $using_brackets;
8577                 $output = $indent;
8578                 if ($main::simple_dumper_nesting > 1) {
8579                     $output .= '[';
8580                     $using_brackets = 1;
8581                 }
8582                 else {
8583                     $using_brackets = 0;
8584                 }
8585
8586                 # If the array is empty, put the closing bracket on the same
8587                 # line.  Otherwise, recursively add each array element
8588                 if (@$item == 0) {
8589                     $output .= " ";
8590                 }
8591                 else {
8592                     $output .= "\n";
8593                     for (my $i = 0; $i < @$item; $i++) {
8594
8595                         # Indent array elements one level
8596                         $output .= &simple_dumper($item->[$i], $next_indent);
8597                         next if ! $debugging_build;
8598                         $output =~ s/\n$//;      # Remove any trailing nl so
8599                         $output .= " # [$i]\n";  # as to add a comment giving
8600                                                  # the array index
8601                     }
8602                     $output .= $indent;     # Indent closing ']' to orig level
8603                 }
8604                 $output .= ']' if $using_brackets;
8605                 $output .= ",\n";
8606             }
8607             elsif (ref $item eq 'HASH') {
8608                 my $is_first_line;
8609                 my $using_braces;
8610                 my $body_indent;
8611
8612                 # No surrounding braces at top level
8613                 $output .= $indent;
8614                 if ($main::simple_dumper_nesting > 1) {
8615                     $output .= "{\n";
8616                     $is_first_line = 0;
8617                     $body_indent = $next_indent;
8618                     $next_indent .= $indent_increment;
8619                     $using_braces = 1;
8620                 }
8621                 else {
8622                     $is_first_line = 1;
8623                     $body_indent = $indent;
8624                     $using_braces = 0;
8625                 }
8626
8627                 # Output hashes sorted alphabetically instead of apparently
8628                 # random.  Use caseless alphabetic sort
8629                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8630                 {
8631                     if ($is_first_line) {
8632                         $is_first_line = 0;
8633                     }
8634                     else {
8635                         $output .= "$body_indent";
8636                     }
8637
8638                     # The key must be a scalar, but this recursive call quotes
8639                     # it
8640                     $output .= &simple_dumper($key);
8641
8642                     # And change the trailing comma and nl to the hash fat
8643                     # comma for clarity, and so the value can be on the same
8644                     # line
8645                     $output =~ s/,\n$/ => /;
8646
8647                     # Recursively call to get the value's dump.
8648                     my $next = &simple_dumper($item->{$key}, $next_indent);
8649
8650                     # If the value is all on one line, remove its indent, so
8651                     # will follow the => immediately.  If it takes more than
8652                     # one line, start it on a new line.
8653                     if ($next !~ /\n.*\n/) {
8654                         $next =~ s/^ *//;
8655                     }
8656                     else {
8657                         $output .= "\n";
8658                     }
8659                     $output .= $next;
8660                 }
8661
8662                 $output .= "$indent},\n" if $using_braces;
8663             }
8664             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8665                 $output = $indent . ref($item) . "\n";
8666                 # XXX see if blessed
8667             }
8668             elsif ($item->can('dump')) {
8669
8670                 # By convention in this program, objects furnish a 'dump'
8671                 # method.  Since not doing any output at this level, just pass
8672                 # on the input indent
8673                 $output = $item->dump($indent);
8674             }
8675             else {
8676                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
8677             }
8678         }
8679         return $output;
8680     }
8681 }
8682
8683 sub dump_inside_out {
8684     # Dump inside-out hashes in an object's state by converting them to a
8685     # regular hash and then calling simple_dumper on that.
8686
8687     my $object = shift;
8688     my $fields_ref = shift;
8689     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8690
8691     my $addr = do { no overloading; pack 'J', $object; };
8692
8693     my %hash;
8694     foreach my $key (keys %$fields_ref) {
8695         $hash{$key} = $fields_ref->{$key}{$addr};
8696     }
8697
8698     return simple_dumper(\%hash, @_);
8699 }
8700
8701 sub _operator_dot {
8702     # Overloaded '.' method that is common to all packages.  It uses the
8703     # package's stringify method.
8704
8705     my $self = shift;
8706     my $other = shift;
8707     my $reversed = shift;
8708     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8709
8710     $other = "" unless defined $other;
8711
8712     foreach my $which (\$self, \$other) {
8713         next unless ref $$which;
8714         if ($$which->can('_operator_stringify')) {
8715             $$which = $$which->_operator_stringify;
8716         }
8717         else {
8718             my $ref = ref $$which;
8719             my $addr = do { no overloading; pack 'J', $$which; };
8720             $$which = "$ref ($addr)";
8721         }
8722     }
8723     return ($reversed)
8724             ? "$other$self"
8725             : "$self$other";
8726 }
8727
8728 sub _operator_equal {
8729     # Generic overloaded '==' routine.  To be equal, they must be the exact
8730     # same object
8731
8732     my $self = shift;
8733     my $other = shift;
8734
8735     return 0 unless defined $other;
8736     return 0 unless ref $other;
8737     no overloading;
8738     return $self == $other;
8739 }
8740
8741 sub _operator_not_equal {
8742     my $self = shift;
8743     my $other = shift;
8744
8745     return ! _operator_equal($self, $other);
8746 }
8747
8748 sub process_PropertyAliases($) {
8749     # This reads in the PropertyAliases.txt file, which contains almost all
8750     # the character properties in Unicode and their equivalent aliases:
8751     # scf       ; Simple_Case_Folding         ; sfc
8752     #
8753     # Field 0 is the preferred short name for the property.
8754     # Field 1 is the full name.
8755     # Any succeeding ones are other accepted names.
8756
8757     my $file= shift;
8758     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8759
8760     # This whole file was non-existent in early releases, so use our own
8761     # internal one.
8762     $file->insert_lines(get_old_property_aliases())
8763                                                 if ! -e 'PropertyAliases.txt';
8764
8765     # Add any cjk properties that may have been defined.
8766     $file->insert_lines(@cjk_properties);
8767
8768     while ($file->next_line) {
8769
8770         my @data = split /\s*;\s*/;
8771
8772         my $full = $data[1];
8773
8774         my $this = Property->new($data[0], Full_Name => $full);
8775
8776         # Start looking for more aliases after these two.
8777         for my $i (2 .. @data - 1) {
8778             $this->add_alias($data[$i]);
8779         }
8780
8781     }
8782     return;
8783 }
8784
8785 sub finish_property_setup {
8786     # Finishes setting up after PropertyAliases.
8787
8788     my $file = shift;
8789     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8790
8791     # This entry was missing from this file in earlier Unicode versions
8792     if (-e 'Jamo.txt') {
8793         my $jsn = property_ref('JSN');
8794         if (! defined $jsn) {
8795             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8796         }
8797     }
8798
8799     # These are used so much, that we set globals for them.
8800     $gc = property_ref('General_Category');
8801     $block = property_ref('Block');
8802     $script = property_ref('Script');
8803
8804     # Perl adds this alias.
8805     $gc->add_alias('Category');
8806
8807     # Unicode::Normalize expects this file with this name and directory.
8808     my $ccc = property_ref('Canonical_Combining_Class');
8809     if (defined $ccc) {
8810         $ccc->set_file('CombiningClass');
8811         $ccc->set_directory(File::Spec->curdir());
8812     }
8813
8814     # These two properties aren't actually used in the core, but unfortunately
8815     # the names just above that are in the core interfere with these, so
8816     # choose different names.  These aren't a problem unless the map tables
8817     # for these files get written out.
8818     my $lowercase = property_ref('Lowercase');
8819     $lowercase->set_file('IsLower') if defined $lowercase;
8820     my $uppercase = property_ref('Uppercase');
8821     $uppercase->set_file('IsUpper') if defined $uppercase;
8822
8823     # Set up the hard-coded default mappings, but only on properties defined
8824     # for this release
8825     foreach my $property (keys %default_mapping) {
8826         my $property_object = property_ref($property);
8827         next if ! defined $property_object;
8828         my $default_map = $default_mapping{$property};
8829         $property_object->set_default_map($default_map);
8830
8831         # A map of <code point> implies the property is string.
8832         if ($property_object->type == $UNKNOWN
8833             && $default_map eq $CODE_POINT)
8834         {
8835             $property_object->set_type($STRING);
8836         }
8837     }
8838
8839     # The following use the Multi_Default class to create objects for
8840     # defaults.
8841
8842     # Bidi class has a complicated default, but the derived file takes care of
8843     # the complications, leaving just 'L'.
8844     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8845         property_ref('Bidi_Class')->set_default_map('L');
8846     }
8847     else {
8848         my $default;
8849
8850         # The derived file was introduced in 3.1.1.  The values below are
8851         # taken from table 3-8, TUS 3.0
8852         my $default_R =
8853             'my $default = Range_List->new;
8854              $default->add_range(0x0590, 0x05FF);
8855              $default->add_range(0xFB1D, 0xFB4F);'
8856         ;
8857
8858         # The defaults apply only to unassigned characters
8859         $default_R .= '$gc->table("Unassigned") & $default;';
8860
8861         if ($v_version lt v3.0.0) {
8862             $default = Multi_Default->new(R => $default_R, 'L');
8863         }
8864         else {
8865
8866             # AL apparently not introduced until 3.0:  TUS 2.x references are
8867             # not on-line to check it out
8868             my $default_AL =
8869                 'my $default = Range_List->new;
8870                  $default->add_range(0x0600, 0x07BF);
8871                  $default->add_range(0xFB50, 0xFDFF);
8872                  $default->add_range(0xFE70, 0xFEFF);'
8873             ;
8874
8875             # Non-character code points introduced in this release; aren't AL
8876             if ($v_version ge 3.1.0) {
8877                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8878             }
8879             $default_AL .= '$gc->table("Unassigned") & $default';
8880             $default = Multi_Default->new(AL => $default_AL,
8881                                           R => $default_R,
8882                                           'L');
8883         }
8884         property_ref('Bidi_Class')->set_default_map($default);
8885     }
8886
8887     # Joining type has a complicated default, but the derived file takes care
8888     # of the complications, leaving just 'U' (or Non_Joining), except the file
8889     # is bad in 3.1.0
8890     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8891         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8892             property_ref('Joining_Type')->set_default_map('Non_Joining');
8893         }
8894         else {
8895
8896             # Otherwise, there are not one, but two possibilities for the
8897             # missing defaults: T and U.
8898             # The missing defaults that evaluate to T are given by:
8899             # T = Mn + Cf - ZWNJ - ZWJ
8900             # where Mn and Cf are the general category values. In other words,
8901             # any non-spacing mark or any format control character, except
8902             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8903             # WIDTH JOINER (joining type C).
8904             my $default = Multi_Default->new(
8905                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8906                'Non_Joining');
8907             property_ref('Joining_Type')->set_default_map($default);
8908         }
8909     }
8910
8911     # Line break has a complicated default in early releases. It is 'Unknown'
8912     # for non-assigned code points; 'AL' for assigned.
8913     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8914         my $lb = property_ref('Line_Break');
8915         if ($v_version gt 3.2.0) {
8916             $lb->set_default_map('Unknown');
8917         }
8918         else {
8919             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8920                                               'AL');
8921             $lb->set_default_map($default);
8922         }
8923
8924         # If has the URS property, make sure that the standard aliases are in
8925         # it, since not in the input tables in some versions.
8926         my $urs = property_ref('Unicode_Radical_Stroke');
8927         if (defined $urs) {
8928             $urs->add_alias('cjkRSUnicode');
8929             $urs->add_alias('kRSUnicode');
8930         }
8931     }
8932
8933     # For backwards compatibility with applications that may read the mapping
8934     # file directly (it was documented in 5.12 and 5.14 as being thusly
8935     # usable), keep it from being adjusted.  (range_size_1 is
8936     # used to force the traditional format.)
8937     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
8938         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
8939         $nfkc_cf->set_range_size_1(1);
8940     }
8941     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
8942         $bmg->set_to_output_map($EXTERNAL_MAP);
8943         $bmg->set_range_size_1(1);
8944     }
8945
8946     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
8947
8948     return;
8949 }
8950
8951 sub get_old_property_aliases() {
8952     # Returns what would be in PropertyAliases.txt if it existed in very old
8953     # versions of Unicode.  It was derived from the one in 3.2, and pared
8954     # down based on the data that was actually in the older releases.
8955     # An attempt was made to use the existence of files to mean inclusion or
8956     # not of various aliases, but if this was not sufficient, using version
8957     # numbers was resorted to.
8958
8959     my @return;
8960
8961     # These are to be used in all versions (though some are constructed by
8962     # this program if missing)
8963     push @return, split /\n/, <<'END';
8964 bc        ; Bidi_Class
8965 Bidi_M    ; Bidi_Mirrored
8966 cf        ; Case_Folding
8967 ccc       ; Canonical_Combining_Class
8968 dm        ; Decomposition_Mapping
8969 dt        ; Decomposition_Type
8970 gc        ; General_Category
8971 isc       ; ISO_Comment
8972 lc        ; Lowercase_Mapping
8973 na        ; Name
8974 na1       ; Unicode_1_Name
8975 nt        ; Numeric_Type
8976 nv        ; Numeric_Value
8977 sfc       ; Simple_Case_Folding
8978 slc       ; Simple_Lowercase_Mapping
8979 stc       ; Simple_Titlecase_Mapping
8980 suc       ; Simple_Uppercase_Mapping
8981 tc        ; Titlecase_Mapping
8982 uc        ; Uppercase_Mapping
8983 END
8984
8985     if (-e 'Blocks.txt') {
8986         push @return, "blk       ; Block\n";
8987     }
8988     if (-e 'ArabicShaping.txt') {
8989         push @return, split /\n/, <<'END';
8990 jg        ; Joining_Group
8991 jt        ; Joining_Type
8992 END
8993     }
8994     if (-e 'PropList.txt') {
8995
8996         # This first set is in the original old-style proplist.
8997         push @return, split /\n/, <<'END';
8998 Alpha     ; Alphabetic
8999 Bidi_C    ; Bidi_Control
9000 Dash      ; Dash
9001 Dia       ; Diacritic
9002 Ext       ; Extender
9003 Hex       ; Hex_Digit
9004 Hyphen    ; Hyphen
9005 IDC       ; ID_Continue
9006 Ideo      ; Ideographic
9007 Join_C    ; Join_Control
9008 Math      ; Math
9009 QMark     ; Quotation_Mark
9010 Term      ; Terminal_Punctuation
9011 WSpace    ; White_Space
9012 END
9013         # The next sets were added later
9014         if ($v_version ge v3.0.0) {
9015             push @return, split /\n/, <<'END';
9016 Upper     ; Uppercase
9017 Lower     ; Lowercase
9018 END
9019         }
9020         if ($v_version ge v3.0.1) {
9021             push @return, split /\n/, <<'END';
9022 NChar     ; Noncharacter_Code_Point
9023 END
9024         }
9025         # The next sets were added in the new-style
9026         if ($v_version ge v3.1.0) {
9027             push @return, split /\n/, <<'END';
9028 OAlpha    ; Other_Alphabetic
9029 OLower    ; Other_Lowercase
9030 OMath     ; Other_Math
9031 OUpper    ; Other_Uppercase
9032 END
9033         }
9034         if ($v_version ge v3.1.1) {
9035             push @return, "AHex      ; ASCII_Hex_Digit\n";
9036         }
9037     }
9038     if (-e 'EastAsianWidth.txt') {
9039         push @return, "ea        ; East_Asian_Width\n";
9040     }
9041     if (-e 'CompositionExclusions.txt') {
9042         push @return, "CE        ; Composition_Exclusion\n";
9043     }
9044     if (-e 'LineBreak.txt') {
9045         push @return, "lb        ; Line_Break\n";
9046     }
9047     if (-e 'BidiMirroring.txt') {
9048         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
9049     }
9050     if (-e 'Scripts.txt') {
9051         push @return, "sc        ; Script\n";
9052     }
9053     if (-e 'DNormalizationProps.txt') {
9054         push @return, split /\n/, <<'END';
9055 Comp_Ex   ; Full_Composition_Exclusion
9056 FC_NFKC   ; FC_NFKC_Closure
9057 NFC_QC    ; NFC_Quick_Check
9058 NFD_QC    ; NFD_Quick_Check
9059 NFKC_QC   ; NFKC_Quick_Check
9060 NFKD_QC   ; NFKD_Quick_Check
9061 XO_NFC    ; Expands_On_NFC
9062 XO_NFD    ; Expands_On_NFD
9063 XO_NFKC   ; Expands_On_NFKC
9064 XO_NFKD   ; Expands_On_NFKD
9065 END
9066     }
9067     if (-e 'DCoreProperties.txt') {
9068         push @return, split /\n/, <<'END';
9069 IDS       ; ID_Start
9070 XIDC      ; XID_Continue
9071 XIDS      ; XID_Start
9072 END
9073         # These can also appear in some versions of PropList.txt
9074         push @return, "Lower     ; Lowercase\n"
9075                                     unless grep { $_ =~ /^Lower\b/} @return;
9076         push @return, "Upper     ; Uppercase\n"
9077                                     unless grep { $_ =~ /^Upper\b/} @return;
9078     }
9079
9080     # This flag requires the DAge.txt file to be copied into the directory.
9081     if (DEBUG && $compare_versions) {
9082         push @return, 'age       ; Age';
9083     }
9084
9085     return @return;
9086 }
9087
9088 sub process_PropValueAliases {
9089     # This file contains values that properties look like:
9090     # bc ; AL        ; Arabic_Letter
9091     # blk; n/a       ; Greek_And_Coptic                 ; Greek
9092     #
9093     # Field 0 is the property.
9094     # Field 1 is the short name of a property value or 'n/a' if no
9095     #                short name exists;
9096     # Field 2 is the full property value name;
9097     # Any other fields are more synonyms for the property value.
9098     # Purely numeric property values are omitted from the file; as are some
9099     # others, fewer and fewer in later releases
9100
9101     # Entries for the ccc property have an extra field before the
9102     # abbreviation:
9103     # ccc;   0; NR   ; Not_Reordered
9104     # It is the numeric value that the names are synonyms for.
9105
9106     # There are comment entries for values missing from this file:
9107     # # @missing: 0000..10FFFF; ISO_Comment; <none>
9108     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9109
9110     my $file= shift;
9111     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9112
9113     # This whole file was non-existent in early releases, so use our own
9114     # internal one if necessary.
9115     if (! -e 'PropValueAliases.txt') {
9116         $file->insert_lines(get_old_property_value_aliases());
9117     }
9118
9119     # Add any explicit cjk values
9120     $file->insert_lines(@cjk_property_values);
9121
9122     # This line is used only for testing the code that checks for name
9123     # conflicts.  There is a script Inherited, and when this line is executed
9124     # it causes there to be a name conflict with the 'Inherited' that this
9125     # program generates for this block property value
9126     #$file->insert_lines('blk; n/a; Herited');
9127
9128
9129     # Process each line of the file ...
9130     while ($file->next_line) {
9131
9132         my ($property, @data) = split /\s*;\s*/;
9133
9134         # The ccc property has an extra field at the beginning, which is the
9135         # numeric value.  Move it to be after the other two, mnemonic, fields,
9136         # so that those will be used as the property value's names, and the
9137         # number will be an extra alias.  (Rightmost splice removes field 1-2,
9138         # returning them in a slice; left splice inserts that before anything,
9139         # thus shifting the former field 0 to after them.)
9140         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9141
9142         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
9143         # there is no short name, use the full one in element 1
9144         if ($data[0] eq "n/a") {
9145             $data[0] = $data[1];
9146         }
9147         elsif ($data[0] ne $data[1]
9148                && standardize($data[0]) eq standardize($data[1])
9149                && $data[1] !~ /[[:upper:]]/)
9150         {
9151             # Also, there is a bug in the file in which "n/a" is omitted, and
9152             # the two fields are identical except for case, and the full name
9153             # is all lower case.  Copy the "short" name unto the full one to
9154             # give it some upper case.
9155
9156             $data[1] = $data[0];
9157         }
9158
9159         # Earlier releases had the pseudo property 'qc' that should expand to
9160         # the ones that replace it below.
9161         if ($property eq 'qc') {
9162             if (lc $data[0] eq 'y') {
9163                 $file->insert_lines('NFC_QC; Y      ; Yes',
9164                                     'NFD_QC; Y      ; Yes',
9165                                     'NFKC_QC; Y     ; Yes',
9166                                     'NFKD_QC; Y     ; Yes',
9167                                     );
9168             }
9169             elsif (lc $data[0] eq 'n') {
9170                 $file->insert_lines('NFC_QC; N      ; No',
9171                                     'NFD_QC; N      ; No',
9172                                     'NFKC_QC; N     ; No',
9173                                     'NFKD_QC; N     ; No',
9174                                     );
9175             }
9176             elsif (lc $data[0] eq 'm') {
9177                 $file->insert_lines('NFC_QC; M      ; Maybe',
9178                                     'NFKC_QC; M     ; Maybe',
9179                                     );
9180             }
9181             else {
9182                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9183             }
9184             next;
9185         }
9186
9187         # The first field is the short name, 2nd is the full one.
9188         my $property_object = property_ref($property);
9189         my $table = $property_object->add_match_table($data[0],
9190                                                 Full_Name => $data[1]);
9191
9192         # Start looking for more aliases after these two.
9193         for my $i (2 .. @data - 1) {
9194             $table->add_alias($data[$i]);
9195         }
9196     } # End of looping through the file
9197
9198     # As noted in the comments early in the program, it generates tables for
9199     # the default values for all releases, even those for which the concept
9200     # didn't exist at the time.  Here we add those if missing.
9201     my $age = property_ref('age');
9202     if (defined $age && ! defined $age->table('Unassigned')) {
9203         $age->add_match_table('Unassigned');
9204     }
9205     $block->add_match_table('No_Block') if -e 'Blocks.txt'
9206                                     && ! defined $block->table('No_Block');
9207
9208
9209     # Now set the default mappings of the properties from the file.  This is
9210     # done after the loop because a number of properties have only @missings
9211     # entries in the file, and may not show up until the end.
9212     my @defaults = $file->get_missings;
9213     foreach my $default_ref (@defaults) {
9214         my $default = $default_ref->[0];
9215         my $property = property_ref($default_ref->[1]);
9216         $property->set_default_map($default);
9217     }
9218     return;
9219 }
9220
9221 sub get_old_property_value_aliases () {
9222     # Returns what would be in PropValueAliases.txt if it existed in very old
9223     # versions of Unicode.  It was derived from the one in 3.2, and pared
9224     # down.  An attempt was made to use the existence of files to mean
9225     # inclusion or not of various aliases, but if this was not sufficient,
9226     # using version numbers was resorted to.
9227
9228     my @return = split /\n/, <<'END';
9229 bc ; AN        ; Arabic_Number
9230 bc ; B         ; Paragraph_Separator
9231 bc ; CS        ; Common_Separator
9232 bc ; EN        ; European_Number
9233 bc ; ES        ; European_Separator
9234 bc ; ET        ; European_Terminator
9235 bc ; L         ; Left_To_Right
9236 bc ; ON        ; Other_Neutral
9237 bc ; R         ; Right_To_Left
9238 bc ; WS        ; White_Space
9239
9240 # The standard combining classes are very much different in v1, so only use
9241 # ones that look right (not checked thoroughly)
9242 ccc;   0; NR   ; Not_Reordered
9243 ccc;   1; OV   ; Overlay
9244 ccc;   7; NK   ; Nukta
9245 ccc;   8; KV   ; Kana_Voicing
9246 ccc;   9; VR   ; Virama
9247 ccc; 202; ATBL ; Attached_Below_Left
9248 ccc; 216; ATAR ; Attached_Above_Right
9249 ccc; 218; BL   ; Below_Left
9250 ccc; 220; B    ; Below
9251 ccc; 222; BR   ; Below_Right
9252 ccc; 224; L    ; Left
9253 ccc; 228; AL   ; Above_Left
9254 ccc; 230; A    ; Above
9255 ccc; 232; AR   ; Above_Right
9256 ccc; 234; DA   ; Double_Above
9257
9258 dt ; can       ; canonical
9259 dt ; enc       ; circle
9260 dt ; fin       ; final
9261 dt ; font      ; font
9262 dt ; fra       ; fraction
9263 dt ; init      ; initial
9264 dt ; iso       ; isolated
9265 dt ; med       ; medial
9266 dt ; n/a       ; none
9267 dt ; nb        ; noBreak
9268 dt ; sqr       ; square
9269 dt ; sub       ; sub
9270 dt ; sup       ; super
9271
9272 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
9273 gc ; Cc        ; Control
9274 gc ; Cn        ; Unassigned
9275 gc ; Co        ; Private_Use
9276 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
9277 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
9278 gc ; Ll        ; Lowercase_Letter
9279 gc ; Lm        ; Modifier_Letter
9280 gc ; Lo        ; Other_Letter
9281 gc ; Lu        ; Uppercase_Letter
9282 gc ; M         ; Mark                             # Mc | Me | Mn
9283 gc ; Mc        ; Spacing_Mark
9284 gc ; Mn        ; Nonspacing_Mark
9285 gc ; N         ; Number                           # Nd | Nl | No
9286 gc ; Nd        ; Decimal_Number
9287 gc ; No        ; Other_Number
9288 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
9289 gc ; Pd        ; Dash_Punctuation
9290 gc ; Pe        ; Close_Punctuation
9291 gc ; Po        ; Other_Punctuation
9292 gc ; Ps        ; Open_Punctuation
9293 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
9294 gc ; Sc        ; Currency_Symbol
9295 gc ; Sm        ; Math_Symbol
9296 gc ; So        ; Other_Symbol
9297 gc ; Z         ; Separator                        # Zl | Zp | Zs
9298 gc ; Zl        ; Line_Separator
9299 gc ; Zp        ; Paragraph_Separator
9300 gc ; Zs        ; Space_Separator
9301
9302 nt ; de        ; Decimal
9303 nt ; di        ; Digit
9304 nt ; n/a       ; None
9305 nt ; nu        ; Numeric
9306 END
9307
9308     if (-e 'ArabicShaping.txt') {
9309         push @return, split /\n/, <<'END';
9310 jg ; n/a       ; AIN
9311 jg ; n/a       ; ALEF
9312 jg ; n/a       ; DAL
9313 jg ; n/a       ; GAF
9314 jg ; n/a       ; LAM
9315 jg ; n/a       ; MEEM
9316 jg ; n/a       ; NO_JOINING_GROUP
9317 jg ; n/a       ; NOON
9318 jg ; n/a       ; QAF
9319 jg ; n/a       ; SAD
9320 jg ; n/a       ; SEEN
9321 jg ; n/a       ; TAH
9322 jg ; n/a       ; WAW
9323
9324 jt ; C         ; Join_Causing
9325 jt ; D         ; Dual_Joining
9326 jt ; L         ; Left_Joining
9327 jt ; R         ; Right_Joining
9328 jt ; U         ; Non_Joining
9329 jt ; T         ; Transparent
9330 END
9331         if ($v_version ge v3.0.0) {
9332             push @return, split /\n/, <<'END';
9333 jg ; n/a       ; ALAPH
9334 jg ; n/a       ; BEH
9335 jg ; n/a       ; BETH
9336 jg ; n/a       ; DALATH_RISH
9337 jg ; n/a       ; E
9338 jg ; n/a       ; FEH
9339 jg ; n/a       ; FINAL_SEMKATH
9340 jg ; n/a       ; GAMAL
9341 jg ; n/a       ; HAH
9342 jg ; n/a       ; HAMZA_ON_HEH_GOAL
9343 jg ; n/a       ; HE
9344 jg ; n/a       ; HEH
9345 jg ; n/a       ; HEH_GOAL
9346 jg ; n/a       ; HETH
9347 jg ; n/a       ; KAF
9348 jg ; n/a       ; KAPH
9349 jg ; n/a       ; KNOTTED_HEH
9350 jg ; n/a       ; LAMADH
9351 jg ; n/a       ; MIM
9352 jg ; n/a       ; NUN
9353 jg ; n/a       ; PE
9354 jg ; n/a       ; QAPH
9355 jg ; n/a       ; REH
9356 jg ; n/a       ; REVERSED_PE
9357 jg ; n/a       ; SADHE
9358 jg ; n/a       ; SEMKATH
9359 jg ; n/a       ; SHIN
9360 jg ; n/a       ; SWASH_KAF
9361 jg ; n/a       ; TAW
9362 jg ; n/a       ; TEH_MARBUTA
9363 jg ; n/a       ; TETH
9364 jg ; n/a       ; YEH
9365 jg ; n/a       ; YEH_BARREE
9366 jg ; n/a       ; YEH_WITH_TAIL
9367 jg ; n/a       ; YUDH
9368 jg ; n/a       ; YUDH_HE
9369 jg ; n/a       ; ZAIN
9370 END
9371         }
9372     }
9373
9374
9375     if (-e 'EastAsianWidth.txt') {
9376         push @return, split /\n/, <<'END';
9377 ea ; A         ; Ambiguous
9378 ea ; F         ; Fullwidth
9379 ea ; H         ; Halfwidth
9380 ea ; N         ; Neutral
9381 ea ; Na        ; Narrow
9382 ea ; W         ; Wide
9383 END
9384     }
9385
9386     if (-e 'LineBreak.txt') {
9387         push @return, split /\n/, <<'END';
9388 lb ; AI        ; Ambiguous
9389 lb ; AL        ; Alphabetic
9390 lb ; B2        ; Break_Both
9391 lb ; BA        ; Break_After
9392 lb ; BB        ; Break_Before
9393 lb ; BK        ; Mandatory_Break
9394 lb ; CB        ; Contingent_Break
9395 lb ; CL        ; Close_Punctuation
9396 lb ; CM        ; Combining_Mark
9397 lb ; CR        ; Carriage_Return
9398 lb ; EX        ; Exclamation
9399 lb ; GL        ; Glue
9400 lb ; HY        ; Hyphen
9401 lb ; ID        ; Ideographic
9402 lb ; IN        ; Inseperable
9403 lb ; IS        ; Infix_Numeric
9404 lb ; LF        ; Line_Feed
9405 lb ; NS        ; Nonstarter
9406 lb ; NU        ; Numeric
9407 lb ; OP        ; Open_Punctuation
9408 lb ; PO        ; Postfix_Numeric
9409 lb ; PR        ; Prefix_Numeric
9410 lb ; QU        ; Quotation
9411 lb ; SA        ; Complex_Context
9412 lb ; SG        ; Surrogate
9413 lb ; SP        ; Space
9414 lb ; SY        ; Break_Symbols
9415 lb ; XX        ; Unknown
9416 lb ; ZW        ; ZWSpace
9417 END
9418     }
9419
9420     if (-e 'DNormalizationProps.txt') {
9421         push @return, split /\n/, <<'END';
9422 qc ; M         ; Maybe
9423 qc ; N         ; No
9424 qc ; Y         ; Yes
9425 END
9426     }
9427
9428     if (-e 'Scripts.txt') {
9429         push @return, split /\n/, <<'END';
9430 sc ; Arab      ; Arabic
9431 sc ; Armn      ; Armenian
9432 sc ; Beng      ; Bengali
9433 sc ; Bopo      ; Bopomofo
9434 sc ; Cans      ; Canadian_Aboriginal
9435 sc ; Cher      ; Cherokee
9436 sc ; Cyrl      ; Cyrillic
9437 sc ; Deva      ; Devanagari
9438 sc ; Dsrt      ; Deseret
9439 sc ; Ethi      ; Ethiopic
9440 sc ; Geor      ; Georgian
9441 sc ; Goth      ; Gothic
9442 sc ; Grek      ; Greek
9443 sc ; Gujr      ; Gujarati
9444 sc ; Guru      ; Gurmukhi
9445 sc ; Hang      ; Hangul
9446 sc ; Hani      ; Han
9447 sc ; Hebr      ; Hebrew
9448 sc ; Hira      ; Hiragana
9449 sc ; Ital      ; Old_Italic
9450 sc ; Kana      ; Katakana
9451 sc ; Khmr      ; Khmer
9452 sc ; Knda      ; Kannada
9453 sc ; Laoo      ; Lao
9454 sc ; Latn      ; Latin
9455 sc ; Mlym      ; Malayalam
9456 sc ; Mong      ; Mongolian
9457 sc ; Mymr      ; Myanmar
9458 sc ; Ogam      ; Ogham
9459 sc ; Orya      ; Oriya
9460 sc ; Qaai      ; Inherited
9461 sc ; Runr      ; Runic
9462 sc ; Sinh      ; Sinhala
9463 sc ; Syrc      ; Syriac
9464 sc ; Taml      ; Tamil
9465 sc ; Telu      ; Telugu
9466 sc ; Thaa      ; Thaana
9467 sc ; Thai      ; Thai
9468 sc ; Tibt      ; Tibetan
9469 sc ; Yiii      ; Yi
9470 sc ; Zyyy      ; Common
9471 END
9472     }
9473
9474     if ($v_version ge v2.0.0) {
9475         push @return, split /\n/, <<'END';
9476 dt ; com       ; compat
9477 dt ; nar       ; narrow
9478 dt ; sml       ; small
9479 dt ; vert      ; vertical
9480 dt ; wide      ; wide
9481
9482 gc ; Cf        ; Format
9483 gc ; Cs        ; Surrogate
9484 gc ; Lt        ; Titlecase_Letter
9485 gc ; Me        ; Enclosing_Mark
9486 gc ; Nl        ; Letter_Number
9487 gc ; Pc        ; Connector_Punctuation
9488 gc ; Sk        ; Modifier_Symbol
9489 END
9490     }
9491     if ($v_version ge v2.1.2) {
9492         push @return, "bc ; S         ; Segment_Separator\n";
9493     }
9494     if ($v_version ge v2.1.5) {
9495         push @return, split /\n/, <<'END';
9496 gc ; Pf        ; Final_Punctuation
9497 gc ; Pi        ; Initial_Punctuation
9498 END
9499     }
9500     if ($v_version ge v2.1.8) {
9501         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
9502     }
9503
9504     if ($v_version ge v3.0.0) {
9505         push @return, split /\n/, <<'END';
9506 bc ; AL        ; Arabic_Letter
9507 bc ; BN        ; Boundary_Neutral
9508 bc ; LRE       ; Left_To_Right_Embedding
9509 bc ; LRO       ; Left_To_Right_Override
9510 bc ; NSM       ; Nonspacing_Mark
9511 bc ; PDF       ; Pop_Directional_Format
9512 bc ; RLE       ; Right_To_Left_Embedding
9513 bc ; RLO       ; Right_To_Left_Override
9514
9515 ccc; 233; DB   ; Double_Below
9516 END
9517     }
9518
9519     if ($v_version ge v3.1.0) {
9520         push @return, "ccc; 226; R    ; Right\n";
9521     }
9522
9523     return @return;
9524 }
9525
9526 sub output_perl_charnames_line ($$) {
9527
9528     # Output the entries in Perl_charnames specially, using 5 digits instead
9529     # of four.  This makes the entries a constant length, and simplifies
9530     # charnames.pm which this table is for.  Unicode can have 6 digit
9531     # ordinals, but they are all private use or noncharacters which do not
9532     # have names, so won't be in this table.
9533
9534     return sprintf "%05X\t%s\n", $_[0], $_[1];
9535 }
9536
9537 { # Closure
9538     # This is used to store the range list of all the code points usable when
9539     # the little used $compare_versions feature is enabled.
9540     my $compare_versions_range_list;
9541
9542     # These are constants to the $property_info hash in this subroutine, to
9543     # avoid using a quoted-string which might have a typo.
9544     my $TYPE  = 'type';
9545     my $DEFAULT_MAP = 'default_map';
9546     my $DEFAULT_TABLE = 'default_table';
9547     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9548     my $MISSINGS = 'missings';
9549
9550     sub process_generic_property_file {
9551         # This processes a file containing property mappings and puts them
9552         # into internal map tables.  It should be used to handle any property
9553         # files that have mappings from a code point or range thereof to
9554         # something else.  This means almost all the UCD .txt files.
9555         # each_line_handlers() should be set to adjust the lines of these
9556         # files, if necessary, to what this routine understands:
9557         #
9558         # 0374          ; NFD_QC; N
9559         # 003C..003E    ; Math
9560         #
9561         # the fields are: "codepoint-range ; property; map"
9562         #
9563         # meaning the codepoints in the range all have the value 'map' under
9564         # 'property'.
9565         # Beginning and trailing white space in each field are not significant.
9566         # Note there is not a trailing semi-colon in the above.  A trailing
9567         # semi-colon means the map is a null-string.  An omitted map, as
9568         # opposed to a null-string, is assumed to be 'Y', based on Unicode
9569         # table syntax.  (This could have been hidden from this routine by
9570         # doing it in the $file object, but that would require parsing of the
9571         # line there, so would have to parse it twice, or change the interface
9572         # to pass this an array.  So not done.)
9573         #
9574         # The map field may begin with a sequence of commands that apply to
9575         # this range.  Each such command begins and ends with $CMD_DELIM.
9576         # These are used to indicate, for example, that the mapping for a
9577         # range has a non-default type.
9578         #
9579         # This loops through the file, calling it's next_line() method, and
9580         # then taking the map and adding it to the property's table.
9581         # Complications arise because any number of properties can be in the
9582         # file, in any order, interspersed in any way.  The first time a
9583         # property is seen, it gets information about that property and
9584         # caches it for quick retrieval later.  It also normalizes the maps
9585         # so that only one of many synonyms is stored.  The Unicode input
9586         # files do use some multiple synonyms.
9587
9588         my $file = shift;
9589         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9590
9591         my %property_info;               # To keep track of what properties
9592                                          # have already had entries in the
9593                                          # current file, and info about each,
9594                                          # so don't have to recompute.
9595         my $property_name;               # property currently being worked on
9596         my $property_type;               # and its type
9597         my $previous_property_name = ""; # name from last time through loop
9598         my $property_object;             # pointer to the current property's
9599                                          # object
9600         my $property_addr;               # the address of that object
9601         my $default_map;                 # the string that code points missing
9602                                          # from the file map to
9603         my $default_table;               # For non-string properties, a
9604                                          # reference to the match table that
9605                                          # will contain the list of code
9606                                          # points that map to $default_map.
9607
9608         # Get the next real non-comment line
9609         LINE:
9610         while ($file->next_line) {
9611
9612             # Default replacement type; means that if parts of the range have
9613             # already been stored in our tables, the new map overrides them if
9614             # they differ more than cosmetically
9615             my $replace = $IF_NOT_EQUIVALENT;
9616             my $map_type;            # Default type for the map of this range
9617
9618             #local $to_trace = 1 if main::DEBUG;
9619             trace $_ if main::DEBUG && $to_trace;
9620
9621             # Split the line into components
9622             my ($range, $property_name, $map, @remainder)
9623                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9624
9625             # If more or less on the line than we are expecting, warn and skip
9626             # the line
9627             if (@remainder) {
9628                 $file->carp_bad_line('Extra fields');
9629                 next LINE;
9630             }
9631             elsif ( ! defined $property_name) {
9632                 $file->carp_bad_line('Missing property');
9633                 next LINE;
9634             }
9635
9636             # Examine the range.
9637             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9638             {
9639                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9640                 next LINE;
9641             }
9642             my $low = hex $1;
9643             my $high = (defined $2) ? hex $2 : $low;
9644
9645             # For the very specialized case of comparing two Unicode
9646             # versions...
9647             if (DEBUG && $compare_versions) {
9648                 if ($property_name eq 'Age') {
9649
9650                     # Only allow code points at least as old as the version
9651                     # specified.
9652                     my $age = pack "C*", split(/\./, $map);        # v string
9653                     next LINE if $age gt $compare_versions;
9654                 }
9655                 else {
9656
9657                     # Again, we throw out code points younger than those of
9658                     # the specified version.  By now, the Age property is
9659                     # populated.  We use the intersection of each input range
9660                     # with this property to find what code points in it are
9661                     # valid.   To do the intersection, we have to convert the
9662                     # Age property map to a Range_list.  We only have to do
9663                     # this once.
9664                     if (! defined $compare_versions_range_list) {
9665                         my $age = property_ref('Age');
9666                         if (! -e 'DAge.txt') {
9667                             croak "Need to have 'DAge.txt' file to do version comparison";
9668                         }
9669                         elsif ($age->count == 0) {
9670                             croak "The 'Age' table is empty, but its file exists";
9671                         }
9672                         $compare_versions_range_list
9673                                         = Range_List->new(Initialize => $age);
9674                     }
9675
9676                     # An undefined map is always 'Y'
9677                     $map = 'Y' if ! defined $map;
9678
9679                     # Calculate the intersection of the input range with the
9680                     # code points that are known in the specified version
9681                     my @ranges = ($compare_versions_range_list
9682                                   & Range->new($low, $high))->ranges;
9683
9684                     # If the intersection is empty, throw away this range
9685                     next LINE unless @ranges;
9686
9687                     # Only examine the first range this time through the loop.
9688                     my $this_range = shift @ranges;
9689
9690                     # Put any remaining ranges in the queue to be processed
9691                     # later.  Note that there is unnecessary work here, as we
9692                     # will do the intersection again for each of these ranges
9693                     # during some future iteration of the LINE loop, but this
9694                     # code is not used in production.  The later intersections
9695                     # are guaranteed to not splinter, so this will not become
9696                     # an infinite loop.
9697                     my $line = join ';', $property_name, $map;
9698                     foreach my $range (@ranges) {
9699                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9700                                                             $range->start,
9701                                                             $range->end,
9702                                                             $line));
9703                     }
9704
9705                     # And process the first range, like any other.
9706                     $low = $this_range->start;
9707                     $high = $this_range->end;
9708                 }
9709             } # End of $compare_versions
9710
9711             # If changing to a new property, get the things constant per
9712             # property
9713             if ($previous_property_name ne $property_name) {
9714
9715                 $property_object = property_ref($property_name);
9716                 if (! defined $property_object) {
9717                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
9718                     next LINE;
9719                 }
9720                 { no overloading; $property_addr = pack 'J', $property_object; }
9721
9722                 # Defer changing names until have a line that is acceptable
9723                 # (the 'next' statement above means is unacceptable)
9724                 $previous_property_name = $property_name;
9725
9726                 # If not the first time for this property, retrieve info about
9727                 # it from the cache
9728                 if (defined ($property_info{$property_addr}{$TYPE})) {
9729                     $property_type = $property_info{$property_addr}{$TYPE};
9730                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
9731                     $map_type
9732                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
9733                     $default_table
9734                             = $property_info{$property_addr}{$DEFAULT_TABLE};
9735                 }
9736                 else {
9737
9738                     # Here, is the first time for this property.  Set up the
9739                     # cache.
9740                     $property_type = $property_info{$property_addr}{$TYPE}
9741                                    = $property_object->type;
9742                     $map_type
9743                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
9744                         = $property_object->pseudo_map_type;
9745
9746                     # The Unicode files are set up so that if the map is not
9747                     # defined, it is a binary property
9748                     if (! defined $map && $property_type != $BINARY) {
9749                         if ($property_type != $UNKNOWN
9750                             && $property_type != $NON_STRING)
9751                         {
9752                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
9753                         }
9754                         else {
9755                             $property_object->set_type($BINARY);
9756                             $property_type
9757                                 = $property_info{$property_addr}{$TYPE}
9758                                 = $BINARY;
9759                         }
9760                     }
9761
9762                     # Get any @missings default for this property.  This
9763                     # should precede the first entry for the property in the
9764                     # input file, and is located in a comment that has been
9765                     # stored by the Input_file class until we access it here.
9766                     # It's possible that there is more than one such line
9767                     # waiting for us; collect them all, and parse
9768                     my @missings_list = $file->get_missings
9769                                             if $file->has_missings_defaults;
9770                     foreach my $default_ref (@missings_list) {
9771                         my $default = $default_ref->[0];
9772                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9773
9774                         # For string properties, the default is just what the
9775                         # file says, but non-string properties should already
9776                         # have set up a table for the default property value;
9777                         # use the table for these, so can resolve synonyms
9778                         # later to a single standard one.
9779                         if ($property_type == $STRING
9780                             || $property_type == $UNKNOWN)
9781                         {
9782                             $property_info{$addr}{$MISSINGS} = $default;
9783                         }
9784                         else {
9785                             $property_info{$addr}{$MISSINGS}
9786                                         = $property_object->table($default);
9787                         }
9788                     }
9789
9790                     # Finished storing all the @missings defaults in the input
9791                     # file so far.  Get the one for the current property.
9792                     my $missings = $property_info{$property_addr}{$MISSINGS};
9793
9794                     # But we likely have separately stored what the default
9795                     # should be.  (This is to accommodate versions of the
9796                     # standard where the @missings lines are absent or
9797                     # incomplete.)  Hopefully the two will match.  But check
9798                     # it out.
9799                     $default_map = $property_object->default_map;
9800
9801                     # If the map is a ref, it means that the default won't be
9802                     # processed until later, so undef it, so next few lines
9803                     # will redefine it to something that nothing will match
9804                     undef $default_map if ref $default_map;
9805
9806                     # Create a $default_map if don't have one; maybe a dummy
9807                     # that won't match anything.
9808                     if (! defined $default_map) {
9809
9810                         # Use any @missings line in the file.
9811                         if (defined $missings) {
9812                             if (ref $missings) {
9813                                 $default_map = $missings->full_name;
9814                                 $default_table = $missings;
9815                             }
9816                             else {
9817                                 $default_map = $missings;
9818                             }
9819
9820                             # And store it with the property for outside use.
9821                             $property_object->set_default_map($default_map);
9822                         }
9823                         else {
9824
9825                             # Neither an @missings nor a default map.  Create
9826                             # a dummy one, so won't have to test definedness
9827                             # in the main loop.
9828                             $default_map = '_Perl This will never be in a file
9829                                             from Unicode';
9830                         }
9831                     }
9832
9833                     # Here, we have $default_map defined, possibly in terms of
9834                     # $missings, but maybe not, and possibly is a dummy one.
9835                     if (defined $missings) {
9836
9837                         # Make sure there is no conflict between the two.
9838                         # $missings has priority.
9839                         if (ref $missings) {
9840                             $default_table
9841                                         = $property_object->table($default_map);
9842                             if (! defined $default_table
9843                                 || $default_table != $missings)
9844                             {
9845                                 if (! defined $default_table) {
9846                                     $default_table = $UNDEF;
9847                                 }
9848                                 $file->carp_bad_line(<<END
9849 The \@missings line for $property_name in $file says that missings default to
9850 $missings, but we expect it to be $default_table.  $missings used.
9851 END
9852                                 );
9853                                 $default_table = $missings;
9854                                 $default_map = $missings->full_name;
9855                             }
9856                             $property_info{$property_addr}{$DEFAULT_TABLE}
9857                                                         = $default_table;
9858                         }
9859                         elsif ($default_map ne $missings) {
9860                             $file->carp_bad_line(<<END
9861 The \@missings line for $property_name in $file says that missings default to
9862 $missings, but we expect it to be $default_map.  $missings used.
9863 END
9864                             );
9865                             $default_map = $missings;
9866                         }
9867                     }
9868
9869                     $property_info{$property_addr}{$DEFAULT_MAP}
9870                                                     = $default_map;
9871
9872                     # If haven't done so already, find the table corresponding
9873                     # to this map for non-string properties.
9874                     if (! defined $default_table
9875                         && $property_type != $STRING
9876                         && $property_type != $UNKNOWN)
9877                     {
9878                         $default_table = $property_info{$property_addr}
9879                                                         {$DEFAULT_TABLE}
9880                                     = $property_object->table($default_map);
9881                     }
9882                 } # End of is first time for this property
9883             } # End of switching properties.
9884
9885             # Ready to process the line.
9886             # The Unicode files are set up so that if the map is not defined,
9887             # it is a binary property with value 'Y'
9888             if (! defined $map) {
9889                 $map = 'Y';
9890             }
9891             else {
9892
9893                 # If the map begins with a special command to us (enclosed in
9894                 # delimiters), extract the command(s).
9895                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9896                     my $command = $1;
9897                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
9898                         $replace = $1;
9899                     }
9900                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
9901                         $map_type = $1;
9902                     }
9903                     else {
9904                         $file->carp_bad_line("Unknown command line: '$1'");
9905                         next LINE;
9906                     }
9907                 }
9908             }
9909
9910             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9911             {
9912
9913                 # Here, we have a map to a particular code point, and the
9914                 # default map is to a code point itself.  If the range
9915                 # includes the particular code point, change that portion of
9916                 # the range to the default.  This makes sure that in the final
9917                 # table only the non-defaults are listed.
9918                 my $decimal_map = hex $map;
9919                 if ($low <= $decimal_map && $decimal_map <= $high) {
9920
9921                     # If the range includes stuff before or after the map
9922                     # we're changing, split it and process the split-off parts
9923                     # later.
9924                     if ($low < $decimal_map) {
9925                         $file->insert_adjusted_lines(
9926                                             sprintf("%04X..%04X; %s; %s",
9927                                                     $low,
9928                                                     $decimal_map - 1,
9929                                                     $property_name,
9930                                                     $map));
9931                     }
9932                     if ($high > $decimal_map) {
9933                         $file->insert_adjusted_lines(
9934                                             sprintf("%04X..%04X; %s; %s",
9935                                                     $decimal_map + 1,
9936                                                     $high,
9937                                                     $property_name,
9938                                                     $map));
9939                     }
9940                     $low = $high = $decimal_map;
9941                     $map = $CODE_POINT;
9942                 }
9943             }
9944
9945             # If we can tell that this is a synonym for the default map, use
9946             # the default one instead.
9947             if ($property_type != $STRING
9948                 && $property_type != $UNKNOWN)
9949             {
9950                 my $table = $property_object->table($map);
9951                 if (defined $table && $table == $default_table) {
9952                     $map = $default_map;
9953                 }
9954             }
9955
9956             # And figure out the map type if not known.
9957             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9958                 if ($map eq "") {   # Nulls are always $NULL map type
9959                     $map_type = $NULL;
9960                 } # Otherwise, non-strings, and those that don't allow
9961                   # $MULTI_CP, and those that aren't multiple code points are
9962                   # 0
9963                 elsif
9964                    (($property_type != $STRING && $property_type != $UNKNOWN)
9965                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9966                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
9967                 {
9968                     $map_type = 0;
9969                 }
9970                 else {
9971                     $map_type = $MULTI_CP;
9972                 }
9973             }
9974
9975             $property_object->add_map($low, $high,
9976                                         $map,
9977                                         Type => $map_type,
9978                                         Replace => $replace);
9979         } # End of loop through file's lines
9980
9981         return;
9982     }
9983 }
9984
9985 { # Closure for UnicodeData.txt handling
9986
9987     # This file was the first one in the UCD; its design leads to some
9988     # awkwardness in processing.  Here is a sample line:
9989     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9990     # The fields in order are:
9991     my $i = 0;            # The code point is in field 0, and is shifted off.
9992     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
9993     my $CATEGORY = $i++;  # category (e.g. "Lu")
9994     my $CCC = $i++;       # Canonical combining class (e.g. "230")
9995     my $BIDI = $i++;      # directional class (e.g. "L")
9996     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
9997     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
9998     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9999                                          # Dual-use in this program; see below
10000     my $NUMERIC = $i++;   # numeric value
10001     my $MIRRORED = $i++;  # ? mirrored
10002     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10003     my $COMMENT = $i++;   # iso comment
10004     my $UPPER = $i++;     # simple uppercase mapping
10005     my $LOWER = $i++;     # simple lowercase mapping
10006     my $TITLE = $i++;     # simple titlecase mapping
10007     my $input_field_count = $i;
10008
10009     # This routine in addition outputs these extra fields:
10010
10011     my $DECOMP_TYPE = $i++; # Decomposition type
10012
10013     # These fields are modifications of ones above, and are usually
10014     # suppressed; they must come last, as for speed, the loop upper bound is
10015     # normally set to ignore them
10016     my $NAME = $i++;        # This is the strict name field, not the one that
10017                             # charnames uses.
10018     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
10019                             # by Unicode::Normalize
10020     my $last_field = $i - 1;
10021
10022     # All these are read into an array for each line, with the indices defined
10023     # above.  The empty fields in the example line above indicate that the
10024     # value is defaulted.  The handler called for each line of the input
10025     # changes these to their defaults.
10026
10027     # Here are the official names of the properties, in a parallel array:
10028     my @field_names;
10029     $field_names[$BIDI] = 'Bidi_Class';
10030     $field_names[$CATEGORY] = 'General_Category';
10031     $field_names[$CCC] = 'Canonical_Combining_Class';
10032     $field_names[$CHARNAME] = 'Perl_Charnames';
10033     $field_names[$COMMENT] = 'ISO_Comment';
10034     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10035     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10036     $field_names[$LOWER] = 'Lowercase_Mapping';
10037     $field_names[$MIRRORED] = 'Bidi_Mirrored';
10038     $field_names[$NAME] = 'Name';
10039     $field_names[$NUMERIC] = 'Numeric_Value';
10040     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10041     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10042     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10043     $field_names[$TITLE] = 'Titlecase_Mapping';
10044     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10045     $field_names[$UPPER] = 'Uppercase_Mapping';
10046
10047     # Some of these need a little more explanation:
10048     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10049     #   property, but is used in calculating the Numeric_Type.  Perl however,
10050     #   creates a file from this field, so a Perl property is created from it.
10051     # Similarly, the Other_Digit field is used only for calculating the
10052     #   Numeric_Type, and so it can be safely re-used as the place to store
10053     #   the value for Numeric_Type; hence it is referred to as
10054     #   $NUMERIC_TYPE_OTHER_DIGIT.
10055     # The input field named $PERL_DECOMPOSITION is a combination of both the
10056     #   decomposition mapping and its type.  Perl creates a file containing
10057     #   exactly this field, so it is used for that.  The two properties are
10058     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10059     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
10060     #   output it), as Perl doesn't use it directly.
10061     # The input field named here $CHARNAME is used to construct the
10062     #   Perl_Charnames property, which is a combination of the Name property
10063     #   (which the input field contains), and the Unicode_1_Name property, and
10064     #   others from other files.  Since, the strict Name property is not used
10065     #   by Perl, this field is used for the table that Perl does use.  The
10066     #   strict Name property table is usually suppressed (unless the lists are
10067     #   changed to output it), so it is accumulated in a separate field,
10068     #   $NAME, which to save time is discarded unless the table is actually to
10069     #   be output
10070
10071     # This file is processed like most in this program.  Control is passed to
10072     # process_generic_property_file() which calls filter_UnicodeData_line()
10073     # for each input line.  This filter converts the input into line(s) that
10074     # process_generic_property_file() understands.  There is also a setup
10075     # routine called before any of the file is processed, and a handler for
10076     # EOF processing, all in this closure.
10077
10078     # A huge speed-up occurred at the cost of some added complexity when these
10079     # routines were altered to buffer the outputs into ranges.  Almost all the
10080     # lines of the input file apply to just one code point, and for most
10081     # properties, the map for the next code point up is the same as the
10082     # current one.  So instead of creating a line for each property for each
10083     # input line, filter_UnicodeData_line() remembers what the previous map
10084     # of a property was, and doesn't generate a line to pass on until it has
10085     # to, as when the map changes; and that passed-on line encompasses the
10086     # whole contiguous range of code points that have the same map for that
10087     # property.  This means a slight amount of extra setup, and having to
10088     # flush these buffers on EOF, testing if the maps have changed, plus
10089     # remembering state information in the closure.  But it means a lot less
10090     # real time in not having to change the data base for each property on
10091     # each line.
10092
10093     # Another complication is that there are already a few ranges designated
10094     # in the input.  There are two lines for each, with the same maps except
10095     # the code point and name on each line.  This was actually the hardest
10096     # thing to design around.  The code points in those ranges may actually
10097     # have real maps not given by these two lines.  These maps will either
10098     # be algorithmically determinable, or be in the extracted files furnished
10099     # with the UCD.  In the event of conflicts between these extracted files,
10100     # and this one, Unicode says that this one prevails.  But it shouldn't
10101     # prevail for conflicts that occur in these ranges.  The data from the
10102     # extracted files prevails in those cases.  So, this program is structured
10103     # so that those files are processed first, storing maps.  Then the other
10104     # files are processed, generally overwriting what the extracted files
10105     # stored.  But just the range lines in this input file are processed
10106     # without overwriting.  This is accomplished by adding a special string to
10107     # the lines output to tell process_generic_property_file() to turn off the
10108     # overwriting for just this one line.
10109     # A similar mechanism is used to tell it that the map is of a non-default
10110     # type.
10111
10112     sub setup_UnicodeData { # Called before any lines of the input are read
10113         my $file = shift;
10114         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10115
10116         # Create a new property specially located that is a combination of the
10117         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10118         # Name_Alias properties.  (The final duplicates elements of the
10119         # first.)  A comment for it will later be constructed based on the
10120         # actual properties present and used
10121         $perl_charname = Property->new('Perl_Charnames',
10122                        Default_Map => "",
10123                        Directory => File::Spec->curdir(),
10124                        File => 'Name',
10125                        Fate => $INTERNAL_ONLY,
10126                        Perl_Extension => 1,
10127                        Range_Size_1 => \&output_perl_charnames_line,
10128                        Type => $STRING,
10129                        );
10130         $perl_charname->set_proxy_for('Name');
10131
10132         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10133                                         Directory => File::Spec->curdir(),
10134                                         File => 'Decomposition',
10135                                         Format => $DECOMP_STRING_FORMAT,
10136                                         Fate => $INTERNAL_ONLY,
10137                                         Perl_Extension => 1,
10138                                         Default_Map => $CODE_POINT,
10139
10140                                         # normalize.pm can't cope with these
10141                                         Output_Range_Counts => 0,
10142
10143                                         # This is a specially formatted table
10144                                         # explicitly for normalize.pm, which
10145                                         # is expecting a particular format,
10146                                         # which means that mappings containing
10147                                         # multiple code points are in the main
10148                                         # body of the table
10149                                         Map_Type => $COMPUTE_NO_MULTI_CP,
10150                                         Type => $STRING,
10151                                         To_Output_Map => $INTERNAL_MAP,
10152                                         );
10153         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10154         $Perl_decomp->add_comment(join_lines(<<END
10155 This mapping is a combination of the Unicode 'Decomposition_Type' and
10156 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
10157 identical to the official Unicode 'Decomposition_Mapping' property except for
10158 two things:
10159  1) It omits the algorithmically determinable Hangul syllable decompositions,
10160 which normalize.pm handles algorithmically.
10161  2) It contains the decomposition type as well.  Non-canonical decompositions
10162 begin with a word in angle brackets, like <super>, which denotes the
10163 compatible decomposition type.  If the map does not begin with the <angle
10164 brackets>, the decomposition is canonical.
10165 END
10166         ));
10167
10168         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10169                                         Default_Map => "",
10170                                         Perl_Extension => 1,
10171                                         Directory => $map_directory,
10172                                         Type => $STRING,
10173                                         To_Output_Map => $OUTPUT_ADJUSTED,
10174                                         );
10175         $Decimal_Digit->add_comment(join_lines(<<END
10176 This file gives the mapping of all code points which represent a single
10177 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10178 points, and the mapping of each non-initial element of each range is actually
10179 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10180 These code points are those that have Numeric_Type=Decimal; not special
10181 things, like subscripts nor Roman numerals.
10182 END
10183         ));
10184
10185         # These properties are not used for generating anything else, and are
10186         # usually not output.  By making them last in the list, we can just
10187         # change the high end of the loop downwards to avoid the work of
10188         # generating a table(s) that is/are just going to get thrown away.
10189         if (! property_ref('Decomposition_Mapping')->to_output_map
10190             && ! property_ref('Name')->to_output_map)
10191         {
10192             $last_field = min($NAME, $DECOMP_MAP) - 1;
10193         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10194             $last_field = $DECOMP_MAP;
10195         } elsif (property_ref('Name')->to_output_map) {
10196             $last_field = $NAME;
10197         }
10198         return;
10199     }
10200
10201     my $first_time = 1;                 # ? Is this the first line of the file
10202     my $in_range = 0;                   # ? Are we in one of the file's ranges
10203     my $previous_cp;                    # hex code point of previous line
10204     my $decimal_previous_cp = -1;       # And its decimal equivalent
10205     my @start;                          # For each field, the current starting
10206                                         # code point in hex for the range
10207                                         # being accumulated.
10208     my @fields;                         # The input fields;
10209     my @previous_fields;                # And those from the previous call
10210
10211     sub filter_UnicodeData_line {
10212         # Handle a single input line from UnicodeData.txt; see comments above
10213         # Conceptually this takes a single line from the file containing N
10214         # properties, and converts it into N lines with one property per line,
10215         # which is what the final handler expects.  But there are
10216         # complications due to the quirkiness of the input file, and to save
10217         # time, it accumulates ranges where the property values don't change
10218         # and only emits lines when necessary.  This is about an order of
10219         # magnitude fewer lines emitted.
10220
10221         my $file = shift;
10222         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10223
10224         # $_ contains the input line.
10225         # -1 in split means retain trailing null fields
10226         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10227
10228         #local $to_trace = 1 if main::DEBUG;
10229         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10230         if (@fields > $input_field_count) {
10231             $file->carp_bad_line('Extra fields');
10232             $_ = "";
10233             return;
10234         }
10235
10236         my $decimal_cp = hex $cp;
10237
10238         # We have to output all the buffered ranges when the next code point
10239         # is not exactly one after the previous one, which means there is a
10240         # gap in the ranges.
10241         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10242
10243         # The decomposition mapping field requires special handling.  It looks
10244         # like either:
10245         #
10246         # <compat> 0032 0020
10247         # 0041 0300
10248         #
10249         # The decomposition type is enclosed in <brackets>; if missing, it
10250         # means the type is canonical.  There are two decomposition mapping
10251         # tables: the one for use by Perl's normalize.pm has a special format
10252         # which is this field intact; the other, for general use is of
10253         # standard format.  In either case we have to find the decomposition
10254         # type.  Empty fields have None as their type, and map to the code
10255         # point itself
10256         if ($fields[$PERL_DECOMPOSITION] eq "") {
10257             $fields[$DECOMP_TYPE] = 'None';
10258             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10259         }
10260         else {
10261             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10262                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
10263             if (! defined $fields[$DECOMP_TYPE]) {
10264                 $fields[$DECOMP_TYPE] = 'Canonical';
10265                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10266             }
10267             else {
10268                 $fields[$DECOMP_MAP] = $map;
10269             }
10270         }
10271
10272         # The 3 numeric fields also require special handling.  The 2 digit
10273         # fields must be either empty or match the number field.  This means
10274         # that if it is empty, they must be as well, and the numeric type is
10275         # None, and the numeric value is 'Nan'.
10276         # The decimal digit field must be empty or match the other digit
10277         # field.  If the decimal digit field is non-empty, the code point is
10278         # a decimal digit, and the other two fields will have the same value.
10279         # If it is empty, but the other digit field is non-empty, the code
10280         # point is an 'other digit', and the number field will have the same
10281         # value as the other digit field.  If the other digit field is empty,
10282         # but the number field is non-empty, the code point is a generic
10283         # numeric type.
10284         if ($fields[$NUMERIC] eq "") {
10285             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10286                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10287             ) {
10288                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
10289             }
10290             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10291             $fields[$NUMERIC] = 'NaN';
10292         }
10293         else {
10294             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10295             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10296                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10297                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10298             }
10299             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10300                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10301                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10302             }
10303             else {
10304                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10305
10306                 # Rationals require extra effort.
10307                 register_fraction($fields[$NUMERIC])
10308                                                 if $fields[$NUMERIC] =~ qr{/};
10309             }
10310         }
10311
10312         # For the properties that have empty fields in the file, and which
10313         # mean something different from empty, change them to that default.
10314         # Certain fields just haven't been empty so far in any Unicode
10315         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10316         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
10317         # the defaults; which are very unlikely to ever change.
10318         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10319         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10320
10321         # UAX44 says that if title is empty, it is the same as whatever upper
10322         # is,
10323         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10324
10325         # There are a few pairs of lines like:
10326         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10327         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10328         # that define ranges.  These should be processed after the fields are
10329         # adjusted above, as they may override some of them; but mostly what
10330         # is left is to possibly adjust the $CHARNAME field.  The names of all the
10331         # paired lines start with a '<', but this is also true of '<control>,
10332         # which isn't one of these special ones.
10333         if ($fields[$CHARNAME] eq '<control>') {
10334
10335             # Some code points in this file have the pseudo-name
10336             # '<control>', but the official name for such ones is the null
10337             # string.  For charnames.pm, we use the Unicode version 1 name
10338             $fields[$NAME] = "";
10339             $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
10340
10341             # We had better not be in between range lines.
10342             if ($in_range) {
10343                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10344                 $in_range = 0;
10345             }
10346         }
10347         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10348
10349             # Here is a non-range line.  We had better not be in between range
10350             # lines.
10351             if ($in_range) {
10352                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10353                 $in_range = 0;
10354             }
10355             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10356
10357                 # These are code points whose names end in their code points,
10358                 # which means the names are algorithmically derivable from the
10359                 # code points.  To shorten the output Name file, the algorithm
10360                 # for deriving these is placed in the file instead of each
10361                 # code point, so they have map type $CP_IN_NAME
10362                 $fields[$CHARNAME] = $CMD_DELIM
10363                                  . $MAP_TYPE_CMD
10364                                  . '='
10365                                  . $CP_IN_NAME
10366                                  . $CMD_DELIM
10367                                  . $fields[$CHARNAME];
10368             }
10369             $fields[$NAME] = $fields[$CHARNAME];
10370         }
10371         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10372             $fields[$CHARNAME] = $fields[$NAME] = $1;
10373
10374             # Here we are at the beginning of a range pair.
10375             if ($in_range) {
10376                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
10377             }
10378             $in_range = 1;
10379
10380             # Because the properties in the range do not overwrite any already
10381             # in the db, we must flush the buffers of what's already there, so
10382             # they get handled in the normal scheme.
10383             $force_output = 1;
10384
10385         }
10386         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10387             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
10388             $_ = "";
10389             return;
10390         }
10391         else { # Here, we are at the last line of a range pair.
10392
10393             if (! $in_range) {
10394                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
10395                 $_ = "";
10396                 return;
10397             }
10398             $in_range = 0;
10399
10400             $fields[$NAME] = $fields[$CHARNAME];
10401
10402             # Check that the input is valid: that the closing of the range is
10403             # the same as the beginning.
10404             foreach my $i (0 .. $last_field) {
10405                 next if $fields[$i] eq $previous_fields[$i];
10406                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
10407             }
10408
10409             # The processing differs depending on the type of range,
10410             # determined by its $CHARNAME
10411             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10412
10413                 # Check that the data looks right.
10414                 if ($decimal_previous_cp != $SBase) {
10415                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
10416                 }
10417                 if ($decimal_cp != $SBase + $SCount - 1) {
10418                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
10419                 }
10420
10421                 # The Hangul syllable range has a somewhat complicated name
10422                 # generation algorithm.  Each code point in it has a canonical
10423                 # decomposition also computable by an algorithm.  The
10424                 # perl decomposition map table built from these is used only
10425                 # by normalize.pm, which has the algorithm built in it, so the
10426                 # decomposition maps are not needed, and are large, so are
10427                 # omitted from it.  If the full decomposition map table is to
10428                 # be output, the decompositions are generated for it, in the
10429                 # EOF handling code for this input file.
10430
10431                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10432
10433                 # This range is stored in our internal structure with its
10434                 # own map type, different from all others.
10435                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10436                                         = $CMD_DELIM
10437                                           . $MAP_TYPE_CMD
10438                                           . '='
10439                                           . $HANGUL_SYLLABLE
10440                                           . $CMD_DELIM
10441                                           . $fields[$CHARNAME];
10442             }
10443             elsif ($fields[$CHARNAME] =~ /^CJK/) {
10444
10445                 # The name for these contains the code point itself, and all
10446                 # are defined to have the same base name, regardless of what
10447                 # is in the file.  They are stored in our internal structure
10448                 # with a map type of $CP_IN_NAME
10449                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10450                                         = $CMD_DELIM
10451                                            . $MAP_TYPE_CMD
10452                                            . '='
10453                                            . $CP_IN_NAME
10454                                            . $CMD_DELIM
10455                                            . 'CJK UNIFIED IDEOGRAPH';
10456
10457             }
10458             elsif ($fields[$CATEGORY] eq 'Co'
10459                      || $fields[$CATEGORY] eq 'Cs')
10460             {
10461                 # The names of all the code points in these ranges are set to
10462                 # null, as there are no names for the private use and
10463                 # surrogate code points.
10464
10465                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10466             }
10467             else {
10468                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
10469             }
10470
10471             # The first line of the range caused everything else to be output,
10472             # and then its values were stored as the beginning values for the
10473             # next set of ranges, which this one ends.  Now, for each value,
10474             # add a command to tell the handler that these values should not
10475             # replace any existing ones in our database.
10476             foreach my $i (0 .. $last_field) {
10477                 $previous_fields[$i] = $CMD_DELIM
10478                                         . $REPLACE_CMD
10479                                         . '='
10480                                         . $NO
10481                                         . $CMD_DELIM
10482                                         . $previous_fields[$i];
10483             }
10484
10485             # And change things so it looks like the entire range has been
10486             # gone through with this being the final part of it.  Adding the
10487             # command above to each field will cause this range to be flushed
10488             # during the next iteration, as it guaranteed that the stored
10489             # field won't match whatever value the next one has.
10490             $previous_cp = $cp;
10491             $decimal_previous_cp = $decimal_cp;
10492
10493             # We are now set up for the next iteration; so skip the remaining
10494             # code in this subroutine that does the same thing, but doesn't
10495             # know about these ranges.
10496             $_ = "";
10497
10498             return;
10499         }
10500
10501         # On the very first line, we fake it so the code below thinks there is
10502         # nothing to output, and initialize so that when it does get output it
10503         # uses the first line's values for the lowest part of the range.
10504         # (One could avoid this by using peek(), but then one would need to
10505         # know the adjustments done above and do the same ones in the setup
10506         # routine; not worth it)
10507         if ($first_time) {
10508             $first_time = 0;
10509             @previous_fields = @fields;
10510             @start = ($cp) x scalar @fields;
10511             $decimal_previous_cp = $decimal_cp - 1;
10512         }
10513
10514         # For each field, output the stored up ranges that this code point
10515         # doesn't fit in.  Earlier we figured out if all ranges should be
10516         # terminated because of changing the replace or map type styles, or if
10517         # there is a gap between this new code point and the previous one, and
10518         # that is stored in $force_output.  But even if those aren't true, we
10519         # need to output the range if this new code point's value for the
10520         # given property doesn't match the stored range's.
10521         #local $to_trace = 1 if main::DEBUG;
10522         foreach my $i (0 .. $last_field) {
10523             my $field = $fields[$i];
10524             if ($force_output || $field ne $previous_fields[$i]) {
10525
10526                 # Flush the buffer of stored values.
10527                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10528
10529                 # Start a new range with this code point and its value
10530                 $start[$i] = $cp;
10531                 $previous_fields[$i] = $field;
10532             }
10533         }
10534
10535         # Set the values for the next time.
10536         $previous_cp = $cp;
10537         $decimal_previous_cp = $decimal_cp;
10538
10539         # The input line has generated whatever adjusted lines are needed, and
10540         # should not be looked at further.
10541         $_ = "";
10542         return;
10543     }
10544
10545     sub EOF_UnicodeData {
10546         # Called upon EOF to flush the buffers, and create the Hangul
10547         # decomposition mappings if needed.
10548
10549         my $file = shift;
10550         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10551
10552         # Flush the buffers.
10553         foreach my $i (1 .. $last_field) {
10554             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10555         }
10556
10557         if (-e 'Jamo.txt') {
10558
10559             # The algorithm is published by Unicode, based on values in
10560             # Jamo.txt, (which should have been processed before this
10561             # subroutine), and the results left in %Jamo
10562             unless (%Jamo) {
10563                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
10564                 return;
10565             }
10566
10567             # If the full decomposition map table is being output, insert
10568             # into it the Hangul syllable mappings.  This is to avoid having
10569             # to publish a subroutine in it to compute them.  (which would
10570             # essentially be this code.)  This uses the algorithm published by
10571             # Unicode.
10572             if (property_ref('Decomposition_Mapping')->to_output_map) {
10573                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10574                     use integer;
10575                     my $SIndex = $S - $SBase;
10576                     my $L = $LBase + $SIndex / $NCount;
10577                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
10578                     my $T = $TBase + $SIndex % $TCount;
10579
10580                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10581                     my $decomposition = sprintf("%04X %04X", $L, $V);
10582                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10583                     $file->insert_adjusted_lines(
10584                                 sprintf("%04X; Decomposition_Mapping; %s",
10585                                         $S,
10586                                         $decomposition));
10587                 }
10588             }
10589         }
10590
10591         return;
10592     }
10593
10594     sub filter_v1_ucd {
10595         # Fix UCD lines in version 1.  This is probably overkill, but this
10596         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
10597         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
10598         #       removed.  This program retains them
10599         # 2)    didn't include ranges, which it should have, and which are now
10600         #       added in @corrected_lines below.  It was hand populated by
10601         #       taking the data from Version 2, verified by analyzing
10602         #       DAge.txt.
10603         # 3)    There is a syntax error in the entry for U+09F8 which could
10604         #       cause problems for utf8_heavy, and so is changed.  It's
10605         #       numeric value was simply a minus sign, without any number.
10606         #       (Eventually Unicode changed the code point to non-numeric.)
10607         # 4)    The decomposition types often don't match later versions
10608         #       exactly, and the whole syntax of that field is different; so
10609         #       the syntax is changed as well as the types to their later
10610         #       terminology.  Otherwise normalize.pm would be very unhappy
10611         # 5)    Many ccc classes are different.  These are left intact.
10612         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
10613         #       fields.  These are unchanged because it doesn't really cause
10614         #       problems for Perl.
10615         # 7)    A number of code points, such as controls, don't have their
10616         #       Unicode Version 1 Names in this file.  These are unchanged.
10617
10618         my @corrected_lines = split /\n/, <<'END';
10619 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10620 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10621 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10622 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10623 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10624 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10625 END
10626
10627         my $file = shift;
10628         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10629
10630         #local $to_trace = 1 if main::DEBUG;
10631         trace $_ if main::DEBUG && $to_trace;
10632
10633         # -1 => retain trailing null fields
10634         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10635
10636         # At the first place that is wrong in the input, insert all the
10637         # corrections, replacing the wrong line.
10638         if ($code_point eq '4E00') {
10639             my @copy = @corrected_lines;
10640             $_ = shift @copy;
10641             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10642
10643             $file->insert_lines(@copy);
10644         }
10645
10646
10647         if ($fields[$NUMERIC] eq '-') {
10648             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
10649         }
10650
10651         if  ($fields[$PERL_DECOMPOSITION] ne "") {
10652
10653             # Several entries have this change to superscript 2 or 3 in the
10654             # middle.  Convert these to the modern version, which is to use
10655             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10656             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10657             # 'HHHH HHHH 00B3 HHHH'.
10658             # It turns out that all of these that don't have another
10659             # decomposition defined at the beginning of the line have the
10660             # <square> decomposition in later releases.
10661             if ($code_point ne '00B2' && $code_point ne '00B3') {
10662                 if  ($fields[$PERL_DECOMPOSITION]
10663                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10664                 {
10665                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10666                         $fields[$PERL_DECOMPOSITION] = '<square> '
10667                         . $fields[$PERL_DECOMPOSITION];
10668                     }
10669                 }
10670             }
10671
10672             # If is like '<+circled> 0052 <-circled>', convert to
10673             # '<circled> 0052'
10674             $fields[$PERL_DECOMPOSITION] =~
10675                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10676
10677             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10678             $fields[$PERL_DECOMPOSITION] =~
10679                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10680             or $fields[$PERL_DECOMPOSITION] =~
10681                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10682             or $fields[$PERL_DECOMPOSITION] =~
10683                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10684             or $fields[$PERL_DECOMPOSITION] =~
10685                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10686
10687             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10688             $fields[$PERL_DECOMPOSITION] =~
10689                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10690
10691             # Change names to modern form.
10692             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10693             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10694             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10695             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10696
10697             # One entry has weird braces
10698             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10699         }
10700
10701         $_ = join ';', $code_point, @fields;
10702         trace $_ if main::DEBUG && $to_trace;
10703         return;
10704     }
10705
10706     sub filter_v2_1_5_ucd {
10707         # A dozen entries in this 2.1.5 file had the mirrored and numeric
10708         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
10709         # column appears to be N, swap it back.
10710
10711         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10712         if ($fields[$NUMERIC] eq 'N') {
10713             $fields[$NUMERIC] = $fields[$MIRRORED];
10714             $fields[$MIRRORED] = 'N';
10715             $_ = join ';', $code_point, @fields;
10716         }
10717         return;
10718     }
10719
10720     sub filter_v6_ucd {
10721
10722         # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10723         # accepted that yet to allow for some deprecation cycles.
10724
10725         return if $_ !~ /^(?:0007|1F514|070F);/;
10726
10727         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10728         if ($code_point eq '0007') {
10729             $fields[$CHARNAME] = "";
10730         }
10731         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10732                             # http://www.unicode.org/versions/corrigendum8.html
10733             $fields[$BIDI] = "AL";
10734         }
10735         elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
10736             $fields[$CHARNAME] = "";
10737         }
10738
10739         $_ = join ';', $code_point, @fields;
10740
10741         return;
10742     }
10743 } # End closure for UnicodeData
10744
10745 sub process_GCB_test {
10746
10747     my $file = shift;
10748     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10749
10750     while ($file->next_line) {
10751         push @backslash_X_tests, $_;
10752     }
10753
10754     return;
10755 }
10756
10757 sub process_NamedSequences {
10758     # NamedSequences.txt entries are just added to an array.  Because these
10759     # don't look like the other tables, they have their own handler.
10760     # An example:
10761     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10762     #
10763     # This just adds the sequence to an array for later handling
10764
10765     my $file = shift;
10766     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10767
10768     while ($file->next_line) {
10769         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10770         if (@remainder) {
10771             $file->carp_bad_line(
10772                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10773             next;
10774         }
10775
10776         # Note single \t in keeping with special output format of
10777         # Perl_charnames.  But it turns out that the code points don't have to
10778         # be 5 digits long, like the rest, based on the internal workings of
10779         # charnames.pm.  This could be easily changed for consistency.
10780         push @named_sequences, "$sequence\t$name";
10781     }
10782     return;
10783 }
10784
10785 { # Closure
10786
10787     my $first_range;
10788
10789     sub  filter_early_ea_lb {
10790         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
10791         # third field be the name of the code point, which can be ignored in
10792         # most cases.  But it can be meaningful if it marks a range:
10793         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10794         # 3400;W;<CJK Ideograph Extension A, First>
10795         #
10796         # We need to see the First in the example above to know it's a range.
10797         # They did not use the later range syntaxes.  This routine changes it
10798         # to use the modern syntax.
10799         # $1 is the Input_file object.
10800
10801         my @fields = split /\s*;\s*/;
10802         if ($fields[2] =~ /^<.*, First>/) {
10803             $first_range = $fields[0];
10804             $_ = "";
10805         }
10806         elsif ($fields[2] =~ /^<.*, Last>/) {
10807             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10808         }
10809         else {
10810             undef $first_range;
10811             $_ = "$fields[0]; $fields[1]";
10812         }
10813
10814         return;
10815     }
10816 }
10817
10818 sub filter_old_style_arabic_shaping {
10819     # Early versions used a different term for the later one.
10820
10821     my @fields = split /\s*;\s*/;
10822     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10823     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
10824     $_ = join ';', @fields;
10825     return;
10826 }
10827
10828 sub filter_arabic_shaping_line {
10829     # ArabicShaping.txt has entries that look like:
10830     # 062A; TEH; D; BEH
10831     # The field containing 'TEH' is not used.  The next field is Joining_Type
10832     # and the last is Joining_Group
10833     # This generates two lines to pass on, one for each property on the input
10834     # line.
10835
10836     my $file = shift;
10837     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10838
10839     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10840
10841     if (@fields > 4) {
10842         $file->carp_bad_line('Extra fields');
10843         $_ = "";
10844         return;
10845     }
10846
10847     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10848     $_ = "$fields[0]; Joining_Type; $fields[2]";
10849
10850     return;
10851 }
10852
10853 { # Closure
10854     my $lc; # Table for lowercase mapping
10855     my $tc;
10856     my $uc;
10857
10858     sub setup_special_casing {
10859         # SpecialCasing.txt contains the non-simple case change mappings.  The
10860         # simple ones are in UnicodeData.txt, which should already have been
10861         # read in to the full property data structures, so as to initialize
10862         # these with the simple ones.  Then the SpecialCasing.txt entries
10863         # add or overwrite the ones which have different full mappings.
10864
10865         # This routine sees if the simple mappings are to be output, and if
10866         # so, copies what has already been put into the full mapping tables,
10867         # while they still contain only the simple mappings.
10868
10869         # The reason it is done this way is that the simple mappings are
10870         # probably not going to be output, so it saves work to initialize the
10871         # full tables with the simple mappings, and then overwrite those
10872         # relatively few entries in them that have different full mappings,
10873         # and thus skip the simple mapping tables altogether.
10874
10875         my $file= shift;
10876         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10877
10878         $lc = property_ref('lc');
10879         $tc = property_ref('tc');
10880         $uc = property_ref('uc');
10881
10882         # For each of the case change mappings...
10883         foreach my $full_table ($lc, $tc, $uc) {
10884             my $full_name = $full_table->name;
10885             unless (defined $full_table && ! $full_table->is_empty) {
10886                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
10887             }
10888
10889             # Create a table in the old-style format and with the original
10890             # file name for backwards compatibility with applications that
10891             # read it directly.  The new tables contain both the simple and
10892             # full maps, and the old are missing simple maps when there is a
10893             # conflicting full one.  Probably it would have been ok to add
10894             # those to the legacy version, as was already done in 5.14 to the
10895             # case folding one, but this was not done, out of an abundance of
10896             # caution.  The tables are set up here before we deal with the
10897             # full maps so that as we handle those, we can override the simple
10898             # maps for them in the legacy table, and merely add them in the
10899             # new-style one.
10900             my $legacy = Property->new("Legacy_" . $full_table->full_name,
10901                                         File => $full_table->full_name =~
10902                                                             s/case_Mapping//r,
10903                                         Range_Size_1 => 1,
10904                                         Format => $HEX_FORMAT,
10905                                         Default_Map => $CODE_POINT,
10906                                         UCD => 0,
10907                                         Initialize => $full_table,
10908                                         To_Output_Map => $EXTERNAL_MAP,
10909             );
10910
10911             $full_table->add_comment(join_lines( <<END
10912 This file includes both the simple and full case changing maps.  The simple
10913 ones are in the main body of the table below, and the full ones adding to or
10914 overriding them are in the hash.
10915 END
10916             ));
10917
10918             # The simple version's name in each mapping merely has an 's' in
10919             # front of the full one's
10920             my $simple_name = 's' . $full_name;
10921             my $simple = property_ref($simple_name);
10922             $simple->initialize($full_table) if $simple->to_output_map();
10923
10924             unless ($simple->to_output_map()) {
10925                 $full_table->set_proxy_for($simple_name);
10926             }
10927         }
10928
10929         return;
10930     }
10931
10932     sub filter_special_casing_line {
10933         # Change the format of $_ from SpecialCasing.txt into something that
10934         # the generic handler understands.  Each input line contains three
10935         # case mappings.  This will generate three lines to pass to the
10936         # generic handler for each of those.
10937
10938         # The input syntax (after stripping comments and trailing white space
10939         # is like one of the following (with the final two being entries that
10940         # we ignore):
10941         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10942         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10943         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10944         # Note the trailing semi-colon, unlike many of the input files.  That
10945         # means that there will be an extra null field generated by the split
10946
10947         my $file = shift;
10948         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10949
10950         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10951                                               # fields
10952
10953         # field #4 is when this mapping is conditional.  If any of these get
10954         # implemented, it would be by hard-coding in the casing functions in
10955         # the Perl core, not through tables.  But if there is a new condition
10956         # we don't know about, output a warning.  We know about all the
10957         # conditions through 6.0
10958         if ($fields[4] ne "") {
10959             my @conditions = split ' ', $fields[4];
10960             if ($conditions[0] ne 'tr'  # We know that these languages have
10961                                         # conditions, and some are multiple
10962                 && $conditions[0] ne 'az'
10963                 && $conditions[0] ne 'lt'
10964
10965                 # And, we know about a single condition Final_Sigma, but
10966                 # nothing else.
10967                 && ($v_version gt v5.2.0
10968                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10969             {
10970                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10971             }
10972             elsif ($conditions[0] ne 'Final_Sigma') {
10973
10974                     # Don't print out a message for Final_Sigma, because we
10975                     # have hard-coded handling for it.  (But the standard
10976                     # could change what the rule should be, but it wouldn't
10977                     # show up here anyway.
10978
10979                     print "# SKIPPING Special Casing: $_\n"
10980                                                     if $verbosity >= $VERBOSE;
10981             }
10982             $_ = "";
10983             return;
10984         }
10985         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10986             $file->carp_bad_line('Extra fields');
10987             $_ = "";
10988             return;
10989         }
10990
10991         my $decimal_code_point = hex $fields[0];
10992
10993         # Loop to handle each of the three mappings in the input line, in
10994         # order, with $i indicating the current field number.
10995         my $i = 0;
10996         for my $object ($lc, $tc, $uc) {
10997             $i++;   # First time through, $i = 0 ... 3rd time = 3
10998
10999             my $value = $object->value_of($decimal_code_point);
11000             $value = ($value eq $CODE_POINT)
11001                       ? $decimal_code_point
11002                       : hex $value;
11003
11004             # If this isn't a multi-character mapping, it should already have
11005             # been read in.
11006             if ($fields[$i] !~ / /) {
11007                 if ($value != hex $fields[$i]) {
11008                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
11009                                   . $object->name
11010                                   . "(0x$fields[0]) is $value"
11011                                   . " and SpecialCasing.txt thinks it is "
11012                                   . hex $fields[$i]
11013                                   . ".  Good luck.  Proceeding anyway.");
11014                 }
11015             }
11016             else {
11017
11018                 # The mapping goes into both the legacy table, in which it
11019                 # replaces the simple one...
11020                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11021                                              . $object->full_name
11022                                              . "; $fields[$i]");
11023
11024                 # ... and, the The regular table, in which it is additional,
11025                 # beyond the simple mapping.
11026                 $file->insert_adjusted_lines("$fields[0]; "
11027                                              . $object->name
11028                                             . "; "
11029                                             . $CMD_DELIM
11030                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11031                                             . $CMD_DELIM
11032                                             . $fields[$i]);
11033             }
11034         }
11035
11036         # Everything has been handled by the insert_adjusted_lines()
11037         $_ = "";
11038
11039         return;
11040     }
11041 }
11042
11043 sub filter_old_style_case_folding {
11044     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11045     # and later style.  Different letters were used in the earlier.
11046
11047     my $file = shift;
11048     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11049
11050     my @fields = split /\s*;\s*/;
11051     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11052         $fields[1] = 'I';
11053     }
11054     elsif ($fields[1] eq 'L') {
11055         $fields[1] = 'C';             # L => C always
11056     }
11057     elsif ($fields[1] eq 'E') {
11058         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
11059             $fields[1] = 'F'
11060         }
11061         else {
11062             $fields[1] = 'C'
11063         }
11064     }
11065     else {
11066         $file->carp_bad_line("Expecting L or E in second field");
11067         $_ = "";
11068         return;
11069     }
11070     $_ = join("; ", @fields) . ';';
11071     return;
11072 }
11073
11074 { # Closure for case folding
11075
11076     # Create the map for simple only if are going to output it, for otherwise
11077     # it takes no part in anything we do.
11078     my $to_output_simple;
11079     my $non_final_folds;
11080
11081     sub setup_case_folding($) {
11082         # Read in the case foldings in CaseFolding.txt.  This handles both
11083         # simple and full case folding.
11084
11085         $to_output_simple
11086                         = property_ref('Simple_Case_Folding')->to_output_map;
11087
11088         if (! $to_output_simple) {
11089             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11090         }
11091
11092         $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
11093                            Perl_Extension => 1,
11094                            Fate => $INTERNAL_ONLY,
11095                            Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
11096                            );
11097
11098         # If we ever wanted to show that these tables were combined, a new
11099         # property method could be created, like set_combined_props()
11100         property_ref('Case_Folding')->add_comment(join_lines( <<END
11101 This file includes both the simple and full case folding maps.  The simple
11102 ones are in the main body of the table below, and the full ones adding to or
11103 overriding them are in the hash.
11104 END
11105         ));
11106         return;
11107     }
11108
11109     sub filter_case_folding_line {
11110         # Called for each line in CaseFolding.txt
11111         # Input lines look like:
11112         # 0041; C; 0061; # LATIN CAPITAL LETTER A
11113         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11114         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11115         #
11116         # 'C' means that folding is the same for both simple and full
11117         # 'F' that it is only for full folding
11118         # 'S' that it is only for simple folding
11119         # 'T' is locale-dependent, and ignored
11120         # 'I' is a type of 'F' used in some early releases.
11121         # Note the trailing semi-colon, unlike many of the input files.  That
11122         # means that there will be an extra null field generated by the split
11123         # below, which we ignore and hence is not an error.
11124
11125         my $file = shift;
11126         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11127
11128         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11129         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11130             $file->carp_bad_line('Extra fields');
11131             $_ = "";
11132             return;
11133         }
11134
11135         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
11136             $_ = "";
11137             return;
11138         }
11139
11140         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
11141         # I are all full foldings; S is single-char.  For S, there is always
11142         # an F entry, so we must allow multiple values for the same code
11143         # point.  Fortunately this table doesn't need further manipulation
11144         # which would preclude using multiple-values.  The S is now included
11145         # so that _swash_inversion_hash() is able to construct closures
11146         # without having to worry about F mappings.
11147         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11148             $_ = "$range; Case_Folding; "
11149                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11150             if ($type eq 'F') {
11151                 my @string = split " ", $map;
11152                 for my $i (0 .. @string  - 1 -1) {
11153                     $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
11154                 }
11155             }
11156         }
11157         else {
11158             $_ = "";
11159             $file->carp_bad_line('Expecting C F I S or T in second field');
11160         }
11161
11162         # C and S are simple foldings, but simple case folding is not needed
11163         # unless we explicitly want its map table output.
11164         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11165             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11166         }
11167
11168         return;
11169     }
11170
11171 } # End case fold closure
11172
11173 sub filter_jamo_line {
11174     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
11175     # from this file that is used in generating the Name property for Jamo
11176     # code points.  But, it also is used to convert early versions' syntax
11177     # into the modern form.  Here are two examples:
11178     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
11179     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
11180     #
11181     # The input is $_, the output is $_ filtered.
11182
11183     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
11184
11185     # Let the caller handle unexpected input.  In earlier versions, there was
11186     # a third field which is supposed to be a comment, but did not have a '#'
11187     # before it.
11188     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11189
11190     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
11191                                 # beginning.
11192
11193     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
11194     $fields[1] = 'R' if $fields[0] eq '1105';
11195
11196     # Add to structure so can generate Names from it.
11197     my $cp = hex $fields[0];
11198     my $short_name = $fields[1];
11199     $Jamo{$cp} = $short_name;
11200     if ($cp <= $LBase + $LCount) {
11201         $Jamo_L{$short_name} = $cp - $LBase;
11202     }
11203     elsif ($cp <= $VBase + $VCount) {
11204         $Jamo_V{$short_name} = $cp - $VBase;
11205     }
11206     elsif ($cp <= $TBase + $TCount) {
11207         $Jamo_T{$short_name} = $cp - $TBase;
11208     }
11209     else {
11210         Carp::my_carp_bug("Unexpected Jamo code point in $_");
11211     }
11212
11213
11214     # Reassemble using just the first two fields to look like a typical
11215     # property file line
11216     $_ = "$fields[0]; $fields[1]";
11217
11218     return;
11219 }
11220
11221 sub register_fraction($) {
11222     # This registers the input rational number so that it can be passed on to
11223     # utf8_heavy.pl, both in rational and floating forms.
11224
11225     my $rational = shift;
11226
11227     my $float = eval $rational;
11228     $nv_floating_to_rational{$float} = $rational;
11229     return;
11230 }
11231
11232 sub filter_numeric_value_line {
11233     # DNumValues contains lines of a different syntax than the typical
11234     # property file:
11235     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
11236     #
11237     # This routine transforms $_ containing the anomalous syntax to the
11238     # typical, by filtering out the extra columns, and convert early version
11239     # decimal numbers to strings that look like rational numbers.
11240
11241     my $file = shift;
11242     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11243
11244     # Starting in 5.1, there is a rational field.  Just use that, omitting the
11245     # extra columns.  Otherwise convert the decimal number in the second field
11246     # to a rational, and omit extraneous columns.
11247     my @fields = split /\s*;\s*/, $_, -1;
11248     my $rational;
11249
11250     if ($v_version ge v5.1.0) {
11251         if (@fields != 4) {
11252             $file->carp_bad_line('Not 4 semi-colon separated fields');
11253             $_ = "";
11254             return;
11255         }
11256         $rational = $fields[3];
11257         $_ = join '; ', @fields[ 0, 3 ];
11258     }
11259     else {
11260
11261         # Here, is an older Unicode file, which has decimal numbers instead of
11262         # rationals in it.  Use the fraction to calculate the denominator and
11263         # convert to rational.
11264
11265         if (@fields != 2 && @fields != 3) {
11266             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11267             $_ = "";
11268             return;
11269         }
11270
11271         my $codepoints = $fields[0];
11272         my $decimal = $fields[1];
11273         if ($decimal =~ s/\.0+$//) {
11274
11275             # Anything ending with a decimal followed by nothing but 0's is an
11276             # integer
11277             $_ = "$codepoints; $decimal";
11278             $rational = $decimal;
11279         }
11280         else {
11281
11282             my $denominator;
11283             if ($decimal =~ /\.50*$/) {
11284                 $denominator = 2;
11285             }
11286
11287             # Here have the hardcoded repeating decimals in the fraction, and
11288             # the denominator they imply.  There were only a few denominators
11289             # in the older Unicode versions of this file which this code
11290             # handles, so it is easy to convert them.
11291
11292             # The 4 is because of a round-off error in the Unicode 3.2 files
11293             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11294                 $denominator = 3;
11295             }
11296             elsif ($decimal =~ /\.[27]50*$/) {
11297                 $denominator = 4;
11298             }
11299             elsif ($decimal =~ /\.[2468]0*$/) {
11300                 $denominator = 5;
11301             }
11302             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11303                 $denominator = 6;
11304             }
11305             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11306                 $denominator = 8;
11307             }
11308             if ($denominator) {
11309                 my $sign = ($decimal < 0) ? "-" : "";
11310                 my $numerator = int((abs($decimal) * $denominator) + .5);
11311                 $rational = "$sign$numerator/$denominator";
11312                 $_ = "$codepoints; $rational";
11313             }
11314             else {
11315                 $file->carp_bad_line("Can't cope with number '$decimal'.");
11316                 $_ = "";
11317                 return;
11318             }
11319         }
11320     }
11321
11322     register_fraction($rational) if $rational =~ qr{/};
11323     return;
11324 }
11325
11326 { # Closure
11327     my %unihan_properties;
11328
11329     sub setup_unihan {
11330         # Do any special setup for Unihan properties.
11331
11332         # This property gives the wrong computed type, so override.
11333         my $usource = property_ref('kIRG_USource');
11334         $usource->set_type($STRING) if defined $usource;
11335
11336         # This property is to be considered binary (it says so in
11337         # http://www.unicode.org/reports/tr38/)
11338         my $iicore = property_ref('kIICore');
11339         if (defined $iicore) {
11340             $iicore->set_type($FORCED_BINARY);
11341             $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11342
11343             # Unicode doesn't include the maps for this property, so don't
11344             # warn that they are missing.
11345             $iicore->set_pre_declared_maps(0);
11346             $iicore->add_comment(join_lines( <<END
11347 This property contains enum values, but Unicode UAX #38 says it should be
11348 interpreted as binary, so Perl creates tables for both 1) its enum values,
11349 plus 2) true/false tables in which it is considered true for all code points
11350 that have a non-null value
11351 END
11352             ));
11353         }
11354
11355         return;
11356     }
11357
11358     sub filter_unihan_line {
11359         # Change unihan db lines to look like the others in the db.  Here is
11360         # an input sample:
11361         #   U+341C        kCangjie        IEKN
11362
11363         # Tabs are used instead of semi-colons to separate fields; therefore
11364         # they may have semi-colons embedded in them.  Change these to periods
11365         # so won't screw up the rest of the code.
11366         s/;/./g;
11367
11368         # Remove lines that don't look like ones we accept.
11369         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11370             $_ = "";
11371             return;
11372         }
11373
11374         # Extract the property, and save a reference to its object.
11375         my $property = $1;
11376         if (! exists $unihan_properties{$property}) {
11377             $unihan_properties{$property} = property_ref($property);
11378         }
11379
11380         # Don't do anything unless the property is one we're handling, which
11381         # we determine by seeing if there is an object defined for it or not
11382         if (! defined $unihan_properties{$property}) {
11383             $_ = "";
11384             return;
11385         }
11386
11387         # Convert the tab separators to our standard semi-colons, and convert
11388         # the U+HHHH notation to the rest of the standard's HHHH
11389         s/\t/;/g;
11390         s/\b U \+ (?= $code_point_re )//xg;
11391
11392         #local $to_trace = 1 if main::DEBUG;
11393         trace $_ if main::DEBUG && $to_trace;
11394
11395         return;
11396     }
11397 }
11398
11399 sub filter_blocks_lines {
11400     # In the Blocks.txt file, the names of the blocks don't quite match the
11401     # names given in PropertyValueAliases.txt, so this changes them so they
11402     # do match:  Blanks and hyphens are changed into underscores.  Also makes
11403     # early release versions look like later ones
11404     #
11405     # $_ is transformed to the correct value.
11406
11407     my $file = shift;
11408         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11409
11410     if ($v_version lt v3.2.0) {
11411         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11412             $_ = "";
11413             return;
11414         }
11415
11416         # Old versions used a different syntax to mark the range.
11417         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11418     }
11419
11420     my @fields = split /\s*;\s*/, $_, -1;
11421     if (@fields != 2) {
11422         $file->carp_bad_line("Expecting exactly two fields");
11423         $_ = "";
11424         return;
11425     }
11426
11427     # Change hyphens and blanks in the block name field only
11428     $fields[1] =~ s/[ -]/_/g;
11429     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
11430
11431     $_ = join("; ", @fields);
11432     return;
11433 }
11434
11435 { # Closure
11436     my $current_property;
11437
11438     sub filter_old_style_proplist {
11439         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
11440         # was in a completely different syntax.  Ken Whistler of Unicode says
11441         # that it was something he used as an aid for his own purposes, but
11442         # was never an official part of the standard.  However, comments in
11443         # DAge.txt indicate that non-character code points were available in
11444         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
11445         # there except through this file (but on the other hand, they first
11446         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11447         # not.  But the claim is that it was published as an aid to others who
11448         # might want some more information than was given in the official UCD
11449         # of the time.  Many of the properties in it were incorporated into
11450         # the later PropList.txt, but some were not.  This program uses this
11451         # early file to generate property tables that are otherwise not
11452         # accessible in the early UCD's, and most were probably not really
11453         # official at that time, so one could argue that it should be ignored,
11454         # and you can easily modify things to skip this.  And there are bugs
11455         # in this file in various versions.  (For example, the 2.1.9 version
11456         # removes from Alphabetic the CJK range starting at 4E00, and they
11457         # weren't added back in until 3.1.0.)  Many of this file's properties
11458         # were later sanctioned, so this code generates tables for those
11459         # properties that aren't otherwise in the UCD of the time but
11460         # eventually did become official, and throws away the rest.  Here is a
11461         # list of all the ones that are thrown away:
11462         #   Bidi=*                       duplicates UnicodeData.txt
11463         #   Combining                    never made into official property;
11464         #                                is \P{ccc=0}
11465         #   Composite                    never made into official property.
11466         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
11467         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
11468         #   Delimiter                    never made into official property;
11469         #                                removed in 3.0.1
11470         #   Format Control               never made into official property;
11471         #                                similar to gc=cf
11472         #   High Surrogate               duplicates Blocks.txt
11473         #   Ignorable Control            never made into official property;
11474         #                                similar to di=y
11475         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
11476         #   Left of Pair                 never made into official property;
11477         #   Line Separator               duplicates UnicodeData.txt: gc=zl
11478         #   Low Surrogate                duplicates Blocks.txt
11479         #   Non-break                    was actually listed as a property
11480         #                                in 3.2, but without any code
11481         #                                points.  Unicode denies that this
11482         #                                was ever an official property
11483         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
11484         #   Numeric                      duplicates UnicodeData.txt: gc=cc
11485         #   Paired Punctuation           never made into official property;
11486         #                                appears to be gc=ps + gc=pe
11487         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
11488         #   Private Use                  duplicates UnicodeData.txt: gc=co
11489         #   Private Use High Surrogate   duplicates Blocks.txt
11490         #   Punctuation                  duplicates UnicodeData.txt: gc=p
11491         #   Space                        different definition than eventual
11492         #                                one.
11493         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
11494         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
11495         #   Zero-width                   never made into official property;
11496         #                                subset of gc=cf
11497         # Most of the properties have the same names in this file as in later
11498         # versions, but a couple do not.
11499         #
11500         # This subroutine filters $_, converting it from the old style into
11501         # the new style.  Here's a sample of the old-style
11502         #
11503         #   *******************************************
11504         #
11505         #   Property dump for: 0x100000A0 (Join Control)
11506         #
11507         #   200C..200D  (2 chars)
11508         #
11509         # In the example, the property is "Join Control".  It is kept in this
11510         # closure between calls to the subroutine.  The numbers beginning with
11511         # 0x were internal to Ken's program that generated this file.
11512
11513         # If this line contains the property name, extract it.
11514         if (/^Property dump for: [^(]*\((.*)\)/) {
11515             $_ = $1;
11516
11517             # Convert white space to underscores.
11518             s/ /_/g;
11519
11520             # Convert the few properties that don't have the same name as
11521             # their modern counterparts
11522             s/Identifier_Part/ID_Continue/
11523             or s/Not_a_Character/NChar/;
11524
11525             # If the name matches an existing property, use it.
11526             if (defined property_ref($_)) {
11527                 trace "new property=", $_ if main::DEBUG && $to_trace;
11528                 $current_property = $_;
11529             }
11530             else {        # Otherwise discard it
11531                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11532                 undef $current_property;
11533             }
11534             $_ = "";    # The property is saved for the next lines of the
11535                         # file, but this defining line is of no further use,
11536                         # so clear it so that the caller won't process it
11537                         # further.
11538         }
11539         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11540
11541             # Here, the input line isn't a header defining a property for the
11542             # following section, and either we aren't in such a section, or
11543             # the line doesn't look like one that defines the code points in
11544             # such a section.  Ignore this line.
11545             $_ = "";
11546         }
11547         else {
11548
11549             # Here, we have a line defining the code points for the current
11550             # stashed property.  Anything starting with the first blank is
11551             # extraneous.  Otherwise, it should look like a normal range to
11552             # the caller.  Append the property name so that it looks just like
11553             # a modern PropList entry.
11554
11555             $_ =~ s/\s.*//;
11556             $_ .= "; $current_property";
11557         }
11558         trace $_ if main::DEBUG && $to_trace;
11559         return;
11560     }
11561 } # End closure for old style proplist
11562
11563 sub filter_old_style_normalization_lines {
11564     # For early releases of Unicode, the lines were like:
11565     #        74..2A76    ; NFKD_NO
11566     # For later releases this became:
11567     #        74..2A76    ; NFKD_QC; N
11568     # Filter $_ to look like those in later releases.
11569     # Similarly for MAYBEs
11570
11571     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11572
11573     # Also, the property FC_NFKC was abbreviated to FNC
11574     s/FNC/FC_NFKC/;
11575     return;
11576 }
11577
11578 sub setup_script_extensions {
11579     # The Script_Extensions property starts out with a clone of the Script
11580     # property.
11581
11582     my $scx = property_ref("Script_Extensions");
11583     $scx = Property->new("scx", Full_Name => "Script_Extensions")
11584                                                             if ! defined $scx;
11585     $scx->_set_format($STRING_WHITE_SPACE_LIST);
11586     $scx->initialize($script);
11587     $scx->set_default_map($script->default_map);
11588     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
11589     $scx->add_comment(join_lines( <<END
11590 The values for code points that appear in one script are just the same as for
11591 the 'Script' property.  Likewise the values for those that appear in many
11592 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
11593 values of code points that appear in a few scripts are a space separated list
11594 of those scripts.
11595 END
11596     ));
11597
11598     # Initialize scx's tables and the aliases for them to be the same as sc's
11599     foreach my $table ($script->tables) {
11600         my $scx_table = $scx->add_match_table($table->name,
11601                                 Full_Name => $table->full_name);
11602         foreach my $alias ($table->aliases) {
11603             $scx_table->add_alias($alias->name);
11604         }
11605     }
11606 }
11607
11608 sub  filter_script_extensions_line {
11609     # The Scripts file comes with the full name for the scripts; the
11610     # ScriptExtensions, with the short name.  The final mapping file is a
11611     # combination of these, and without adjustment, would have inconsistent
11612     # entries.  This filters the latter file to convert to full names.
11613     # Entries look like this:
11614     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11615
11616     my @fields = split /\s*;\s*/;
11617     my @full_names;
11618     foreach my $short_name (split " ", $fields[1]) {
11619         push @full_names, $script->table($short_name)->full_name;
11620     }
11621     $fields[1] = join " ", @full_names;
11622     $_ = join "; ", @fields;
11623
11624     return;
11625 }
11626
11627 sub setup_early_name_alias {
11628     my $aliases = property_ref('Name_Alias');
11629     $aliases = Property->new('Name_Alias') if ! defined $aliases;
11630
11631     # Before 6.0, this wasn't a problem, and after it, this alias is part of
11632     # the Unicode-delivered file.
11633     $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
11634     return;
11635 }
11636
11637 sub filter_later_version_name_alias_line {
11638
11639     # This file has an extra entry per line for the alias type.  This is
11640     # handled by creating a compound entry: "$alias: $type";  First, split
11641     # the line into components.
11642     my ($range, $alias, $type, @remainder)
11643         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11644
11645     # This file contains multiple entries for some components, so tell the
11646     # downstream code to allow this in our internal tables; the
11647     # $MULTIPLE_AFTER preserves the input ordering.
11648     $_ = join ";", $range, $CMD_DELIM
11649                            . $REPLACE_CMD
11650                            . '='
11651                            . $MULTIPLE_AFTER
11652                            . $CMD_DELIM
11653                            . "$alias: $type",
11654                    @remainder;
11655     return;
11656 }
11657
11658 sub filter_early_version_name_alias_line {
11659
11660     # Early versions did not have the trailing alias type field; implicitly it
11661     # was 'correction'
11662     $_ .= "; correction";
11663     filter_later_version_name_alias_line;
11664     return;
11665 }
11666
11667 sub finish_Unicode() {
11668     # This routine should be called after all the Unicode files have been read
11669     # in.  It:
11670     # 1) Adds the mappings for code points missing from the files which have
11671     #    defaults specified for them.
11672     # 2) At this this point all mappings are known, so it computes the type of
11673     #    each property whose type hasn't been determined yet.
11674     # 3) Calculates all the regular expression match tables based on the
11675     #    mappings.
11676     # 3) Calculates and adds the tables which are defined by Unicode, but
11677     #    which aren't derived by them, and certain derived tables that Perl
11678     #    uses.
11679
11680     # For each property, fill in any missing mappings, and calculate the re
11681     # match tables.  If a property has more than one missing mapping, the
11682     # default is a reference to a data structure, and requires data from other
11683     # properties to resolve.  The sort is used to cause these to be processed
11684     # last, after all the other properties have been calculated.
11685     # (Fortunately, the missing properties so far don't depend on each other.)
11686     foreach my $property
11687         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11688         property_ref('*'))
11689     {
11690         # $perl has been defined, but isn't one of the Unicode properties that
11691         # need to be finished up.
11692         next if $property == $perl;
11693
11694         # Nor do we need to do anything with properties that aren't going to
11695         # be output.
11696         next if $property->fate == $SUPPRESSED;
11697
11698         # Handle the properties that have more than one possible default
11699         if (ref $property->default_map) {
11700             my $default_map = $property->default_map;
11701
11702             # These properties have stored in the default_map:
11703             # One or more of:
11704             #   1)  A default map which applies to all code points in a
11705             #       certain class
11706             #   2)  an expression which will evaluate to the list of code
11707             #       points in that class
11708             # And
11709             #   3) the default map which applies to every other missing code
11710             #      point.
11711             #
11712             # Go through each list.
11713             while (my ($default, $eval) = $default_map->get_next_defaults) {
11714
11715                 # Get the class list, and intersect it with all the so-far
11716                 # unspecified code points yielding all the code points
11717                 # in the class that haven't been specified.
11718                 my $list = eval $eval;
11719                 if ($@) {
11720                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11721                     last;
11722                 }
11723
11724                 # Narrow down the list to just those code points we don't have
11725                 # maps for yet.
11726                 $list = $list & $property->inverse_list;
11727
11728                 # Add mappings to the property for each code point in the list
11729                 foreach my $range ($list->ranges) {
11730                     $property->add_map($range->start, $range->end, $default,
11731                     Replace => $CROAK);
11732                 }
11733             }
11734
11735             # All remaining code points have the other mapping.  Set that up
11736             # so the normal single-default mapping code will work on them
11737             $property->set_default_map($default_map->other_default);
11738
11739             # And fall through to do that
11740         }
11741
11742         # We should have enough data now to compute the type of the property.
11743         $property->compute_type;
11744         my $property_type = $property->type;
11745
11746         next if ! $property->to_create_match_tables;
11747
11748         # Here want to create match tables for this property
11749
11750         # The Unicode db always (so far, and they claim into the future) have
11751         # the default for missing entries in binary properties be 'N' (unless
11752         # there is a '@missing' line that specifies otherwise)
11753         if ($property_type == $BINARY && ! defined $property->default_map) {
11754             $property->set_default_map('N');
11755         }
11756
11757         # Add any remaining code points to the mapping, using the default for
11758         # missing code points.
11759         my $default_table;
11760         if (defined (my $default_map = $property->default_map)) {
11761
11762             # Make sure there is a match table for the default
11763             if (! defined ($default_table = $property->table($default_map))) {
11764                 $default_table = $property->add_match_table($default_map);
11765             }
11766
11767             # And, if the property is binary, the default table will just
11768             # be the complement of the other table.
11769             if ($property_type == $BINARY) {
11770                 my $non_default_table;
11771
11772                 # Find the non-default table.
11773                 for my $table ($property->tables) {
11774                     next if $table == $default_table;
11775                     $non_default_table = $table;
11776                 }
11777                 $default_table->set_complement($non_default_table);
11778             }
11779             else {
11780
11781                 # This fills in any missing values with the default.  It's not
11782                 # necessary to do this with binary properties, as the default
11783                 # is defined completely in terms of the Y table.
11784                 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
11785                                    $default_map, Replace => $NO);
11786             }
11787         }
11788
11789         # Have all we need to populate the match tables.
11790         my $property_name = $property->name;
11791         my $maps_should_be_defined = $property->pre_declared_maps;
11792         foreach my $range ($property->ranges) {
11793             my $map = $range->value;
11794             my $table = $property->table($map);
11795             if (! defined $table) {
11796
11797                 # Integral and rational property values are not necessarily
11798                 # defined in PropValueAliases, but whether all the other ones
11799                 # should be depends on the property.
11800                 if ($maps_should_be_defined
11801                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11802                 {
11803                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
11804                 }
11805                 $table = $property->add_match_table($map);
11806             }
11807
11808             next if $table->complement != 0;    # Don't need to populate these
11809             $table->add_range($range->start, $range->end);
11810         }
11811
11812         # A forced binary property has additional true/false tables which
11813         # should have been set up when it was forced into binary.  The false
11814         # table matches exactly the same set as the property's default table.
11815         # The true table matches the complement of that.  The false table is
11816         # not the same as an additional set of aliases on top of the default
11817         # table, so use 'set_equivalent_to'.  If it were implemented as
11818         # additional aliases, various things would have to be adjusted, but
11819         # especially, if the user wants to get a list of names for the table
11820         # using Unicode::UCD::prop_value_aliases(), s/he should get a
11821         # different set depending on whether they want the default table or
11822         # the false table.
11823         if ($property_type == $FORCED_BINARY) {
11824             $property->table('N')->set_equivalent_to($default_table,
11825                                                      Related => 1);
11826             $property->table('Y')->set_complement($default_table);
11827         }
11828
11829         # For Perl 5.6 compatibility, all properties matchable in regexes can
11830         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
11831         # But warn if this creates a conflict with a (new) Unicode property
11832         # name, although it appears that Unicode has made a decision never to
11833         # begin a property name with 'Is_', so this shouldn't happen.
11834         foreach my $alias ($property->aliases) {
11835             my $Is_name = 'Is_' . $alias->name;
11836             if (defined (my $pre_existing = property_ref($Is_name))) {
11837                 Carp::my_carp(<<END
11838 There is already an alias named $Is_name (from " . $pre_existing . "), so
11839 creating one for $property won't work.  This is bad news.  If it is not too
11840 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
11841 from the git blame log for this area of the code that suppressed individual
11842 aliases that conflict with the new Unicode names.  Proceeding anyway.
11843 END
11844                 );
11845             }
11846         } # End of loop through aliases for this property
11847     } # End of loop through all Unicode properties.
11848
11849     # Fill in the mappings that Unicode doesn't completely furnish.  First the
11850     # single letter major general categories.  If Unicode were to start
11851     # delivering the values, this would be redundant, but better that than to
11852     # try to figure out if should skip and not get it right.  Ths could happen
11853     # if a new major category were to be introduced, and the hard-coded test
11854     # wouldn't know about it.
11855     # This routine depends on the standard names for the general categories
11856     # being what it thinks they are, like 'Cn'.  The major categories are the
11857     # union of all the general category tables which have the same first
11858     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11859     foreach my $minor_table ($gc->tables) {
11860         my $minor_name = $minor_table->name;
11861         next if length $minor_name == 1;
11862         if (length $minor_name != 2) {
11863             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
11864             next;
11865         }
11866
11867         my $major_name = uc(substr($minor_name, 0, 1));
11868         my $major_table = $gc->table($major_name);
11869         $major_table += $minor_table;
11870     }
11871
11872     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
11873     # defines it as LC)
11874     my $LC = $gc->table('LC');
11875     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
11876     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
11877
11878
11879     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11880                          # deliver the correct values in it
11881         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11882
11883         # Lt not in release 1.
11884         if (defined $gc->table('Lt')) {
11885             $LC += $gc->table('Lt');
11886             $gc->table('Lt')->set_caseless_equivalent($LC);
11887         }
11888     }
11889     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11890
11891     $gc->table('Ll')->set_caseless_equivalent($LC);
11892     $gc->table('Lu')->set_caseless_equivalent($LC);
11893
11894     my $Cs = $gc->table('Cs');
11895
11896
11897     # Folding information was introduced later into Unicode data.  To get
11898     # Perl's case ignore (/i) to work at all in releases that don't have
11899     # folding, use the best available alternative, which is lower casing.
11900     my $fold = property_ref('Simple_Case_Folding');
11901     if ($fold->is_empty) {
11902         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11903         $fold->add_note(join_lines(<<END
11904 WARNING: This table uses lower case as a substitute for missing fold
11905 information
11906 END
11907         ));
11908     }
11909
11910     # Multiple-character mapping was introduced later into Unicode data.  If
11911     # missing, use the single-characters maps as best available alternative
11912     foreach my $map (qw {   Uppercase_Mapping
11913                             Lowercase_Mapping
11914                             Titlecase_Mapping
11915                             Case_Folding
11916                         } )
11917     {
11918         my $full = property_ref($map);
11919         if ($full->is_empty) {
11920             my $simple = property_ref('Simple_' . $map);
11921             $full->initialize($simple);
11922             $full->add_comment($simple->comment) if ($simple->comment);
11923             $full->add_note(join_lines(<<END
11924 WARNING: This table uses simple mapping (single-character only) as a
11925 substitute for missing multiple-character information
11926 END
11927             ));
11928         }
11929     }
11930
11931     # Create digit and case fold tables with the original file names for
11932     # backwards compatibility with applications that read them directly.
11933     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
11934                               Default_Map => "",
11935                               Perl_Extension => 1,
11936                               File => 'Digit',    # Trad. location
11937                               Directory => $map_directory,
11938                               UCD => 0,
11939                               Type => $STRING,
11940                               To_Output_Map => $EXTERNAL_MAP,
11941                               Range_Size_1 => 1,
11942                               Initialize => property_ref('Perl_Decimal_Digit'),
11943                             );
11944     $Digit->add_comment(join_lines(<<END
11945 This file gives the mapping of all code points which represent a single
11946 decimal digit [0-9] to their respective digits.  For example, the code point
11947 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
11948 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
11949 numerals.
11950 END
11951     ));
11952
11953     Property->new('Legacy_Case_Folding',
11954                     File => "Fold",
11955                     Directory => $map_directory,
11956                     Default_Map => $CODE_POINT,
11957                     UCD => 0,
11958                     Range_Size_1 => 1,
11959                     Type => $STRING,
11960                     To_Output_Map => $EXTERNAL_MAP,
11961                     Format => $HEX_FORMAT,
11962                     Initialize => property_ref('cf'),
11963     );
11964
11965     # The Script_Extensions property started out as a clone of the Script
11966     # property.  But processing its data file caused some elements to be
11967     # replaced with different data.  (These elements were for the Common and
11968     # Inherited properties.)  This data is a qw() list of all the scripts that
11969     # the code points in the given range are in.  An example line is:
11970     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
11971     #
11972     # The code above has created a new match table named "Arab Syrc Thaa"
11973     # which contains 060C.  (The cloned table started out with this code point
11974     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
11975     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
11976     # match table.  This is repeated for all these tables and ranges.  The map
11977     # data is retained in the map table for reference, but the spurious match
11978     # tables are deleted.
11979
11980     my $scx = property_ref("Script_Extensions");
11981     if (defined $scx) {
11982         foreach my $table ($scx->tables) {
11983             next unless $table->name =~ /\s/;   # All the new and only the new
11984                                                 # tables have a space in their
11985                                                 # names
11986             my @scripts = split /\s+/, $table->name;
11987             foreach my $script (@scripts) {
11988                 my $script_table = $scx->table($script);
11989                 $script_table += $table;
11990             }
11991             $scx->delete_match_table($table);
11992         }
11993     }
11994
11995     return;
11996 }
11997
11998 sub compile_perl() {
11999     # Create perl-defined tables.  Almost all are part of the pseudo-property
12000     # named 'perl' internally to this program.  Many of these are recommended
12001     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
12002     # on those found there.
12003     # Almost all of these are equivalent to some Unicode property.
12004     # A number of these properties have equivalents restricted to the ASCII
12005     # range, with their names prefaced by 'Posix', to signify that these match
12006     # what the Posix standard says they should match.  A couple are
12007     # effectively this, but the name doesn't have 'Posix' in it because there
12008     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
12009     # to the full Unicode range, by our guesses as to what is appropriate.
12010
12011     # 'Any' is all code points.  As an error check, instead of just setting it
12012     # to be that, construct it to be the union of all the major categories
12013     $Any = $perl->add_match_table('Any',
12014             Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
12015             Matches_All => 1);
12016
12017     foreach my $major_table ($gc->tables) {
12018
12019         # Major categories are the ones with single letter names.
12020         next if length($major_table->name) != 1;
12021
12022         $Any += $major_table;
12023     }
12024
12025     if ($Any->max != $MAX_UNICODE_CODEPOINT) {
12026         Carp::my_carp_bug("Generated highest code point ("
12027            . sprintf("%X", $Any->max)
12028            . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
12029     }
12030     if ($Any->range_count != 1 || $Any->min != 0) {
12031      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
12032     }
12033
12034     $Any->add_alias('All');
12035
12036     # Assigned is the opposite of gc=unassigned
12037     my $Assigned = $perl->add_match_table('Assigned',
12038                                 Description  => "All assigned code points",
12039                                 Initialize => ~ $gc->table('Unassigned'),
12040                                 );
12041
12042     # Our internal-only property should be treated as more than just a
12043     # synonym; grandfather it in to the pod.
12044     $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
12045                             Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
12046             ->set_equivalent_to(property_ref('ccc')->table('Above'),
12047                                                                 Related => 1);
12048
12049     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
12050     if (defined $block) {   # This is equivalent to the block if have it.
12051         my $Unicode_ASCII = $block->table('Basic_Latin');
12052         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
12053             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
12054         }
12055     }
12056
12057     # Very early releases didn't have blocks, so initialize ASCII ourselves if
12058     # necessary
12059     if ($ASCII->is_empty) {
12060         $ASCII->initialize([ 0..127 ]);
12061     }
12062
12063     # Get the best available case definitions.  Early Unicode versions didn't
12064     # have Uppercase and Lowercase defined, so use the general category
12065     # instead for them.
12066     my $Lower = $perl->add_match_table('Lower');
12067     my $Unicode_Lower = property_ref('Lowercase');
12068     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
12069         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
12070         $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12071         $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
12072         $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12073
12074     }
12075     else {
12076         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
12077                                                                 Related => 1);
12078     }
12079     $Lower->add_alias('XPosixLower');
12080     my $Posix_Lower = $perl->add_match_table("PosixLower",
12081                             Description => "[a-z]",
12082                             Initialize => $Lower & $ASCII,
12083                             );
12084
12085     my $Upper = $perl->add_match_table('Upper');
12086     my $Unicode_Upper = property_ref('Uppercase');
12087     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
12088         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
12089         $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12090         $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
12091         $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12092     }
12093     else {
12094         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
12095                                                                 Related => 1);
12096     }
12097     $Upper->add_alias('XPosixUpper');
12098     my $Posix_Upper = $perl->add_match_table("PosixUpper",
12099                             Description => "[A-Z]",
12100                             Initialize => $Upper & $ASCII,
12101                             );
12102
12103     # Earliest releases didn't have title case.  Initialize it to empty if not
12104     # otherwise present
12105     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
12106                                        Description => '(= \p{Gc=Lt})');
12107     my $lt = $gc->table('Lt');
12108
12109     # Earlier versions of mktables had this related to $lt since they have
12110     # identical code points, but their caseless equivalents are not the same,
12111     # one being 'Cased' and the other being 'LC', and so now must be kept as
12112     # separate entities.
12113     $Title += $lt if defined $lt;
12114
12115     # If this Unicode version doesn't have Cased, set up our own.  From
12116     # Unicode 5.1: Definition D120: A character C is defined to be cased if
12117     # and only if C has the Lowercase or Uppercase property or has a
12118     # General_Category value of Titlecase_Letter.
12119     my $Unicode_Cased = property_ref('Cased');
12120     unless (defined $Unicode_Cased) {
12121         my $cased = $perl->add_match_table('Cased',
12122                         Initialize => $Lower + $Upper + $Title,
12123                         Description => 'Uppercase or Lowercase or Titlecase',
12124                         );
12125         $Unicode_Cased = $cased;
12126     }
12127     $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
12128
12129     # Similarly, set up our own Case_Ignorable property if this Unicode
12130     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
12131     # C is defined to be case-ignorable if C has the value MidLetter or the
12132     # value MidNumLet for the Word_Break property or its General_Category is
12133     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
12134     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
12135
12136     # Perl has long had an internal-only alias for this property; grandfather
12137     # it in to the pod, but discourage its use.
12138     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
12139                                                      Re_Pod_Entry => 1,
12140                                                      Fate => $INTERNAL_ONLY,
12141                                                      Status => $DISCOURAGED);
12142     my $case_ignorable = property_ref('Case_Ignorable');
12143     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
12144         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
12145                                                                 Related => 1);
12146     }
12147     else {
12148
12149         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
12150
12151         # The following three properties are not in early releases
12152         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
12153         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
12154         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
12155
12156         # For versions 4.1 - 5.0, there is no MidNumLet property, and
12157         # correspondingly the case-ignorable definition lacks that one.  For
12158         # 4.0, it appears that it was meant to be the same definition, but was
12159         # inadvertently omitted from the standard's text, so add it if the
12160         # property actually is there
12161         my $wb = property_ref('Word_Break');
12162         if (defined $wb) {
12163             my $midlet = $wb->table('MidLetter');
12164             $perl_case_ignorable += $midlet if defined $midlet;
12165             my $midnumlet = $wb->table('MidNumLet');
12166             $perl_case_ignorable += $midnumlet if defined $midnumlet;
12167         }
12168         else {
12169
12170             # In earlier versions of the standard, instead of the above two
12171             # properties , just the following characters were used:
12172             $perl_case_ignorable +=  0x0027  # APOSTROPHE
12173                                 +   0x00AD  # SOFT HYPHEN (SHY)
12174                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
12175         }
12176     }
12177
12178     # The remaining perl defined tables are mostly based on Unicode TR 18,
12179     # "Annex C: Compatibility Properties".  All of these have two versions,
12180     # one whose name generally begins with Posix that is posix-compliant, and
12181     # one that matches Unicode characters beyond the Posix, ASCII range
12182
12183     my $Alpha = $perl->add_match_table('Alpha');
12184
12185     # Alphabetic was not present in early releases
12186     my $Alphabetic = property_ref('Alphabetic');
12187     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
12188         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
12189     }
12190     else {
12191
12192         # For early releases, we don't get it exactly right.  The below
12193         # includes more than it should, which in 5.2 terms is: L + Nl +
12194         # Other_Alphabetic.  Other_Alphabetic contains many characters from
12195         # Mn and Mc.  It's better to match more than we should, than less than
12196         # we should.
12197         $Alpha->initialize($gc->table('Letter')
12198                             + $gc->table('Mn')
12199                             + $gc->table('Mc'));
12200         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
12201         $Alpha->add_description('Alphabetic');
12202     }
12203     $Alpha->add_alias('XPosixAlpha');
12204     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
12205                             Description => "[A-Za-z]",
12206                             Initialize => $Alpha & $ASCII,
12207                             );
12208     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
12209     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
12210
12211     my $Alnum = $perl->add_match_table('Alnum',
12212                         Description => 'Alphabetic and (decimal) Numeric',
12213                         Initialize => $Alpha + $gc->table('Decimal_Number'),
12214                         );
12215     $Alnum->add_alias('XPosixAlnum');
12216     $perl->add_match_table("PosixAlnum",
12217                             Description => "[A-Za-z0-9]",
12218                             Initialize => $Alnum & $ASCII,
12219                             );
12220
12221     my $Word = $perl->add_match_table('Word',
12222                                 Description => '\w, including beyond ASCII;'
12223                                             . ' = \p{Alnum} + \pM + \p{Pc}',
12224                                 Initialize => $Alnum + $gc->table('Mark'),
12225                                 );
12226     $Word->add_alias('XPosixWord');
12227     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
12228     $Word += $Pc if defined $Pc;
12229
12230     # This is a Perl extension, so the name doesn't begin with Posix.
12231     my $PerlWord = $perl->add_match_table('PerlWord',
12232                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
12233                     Initialize => $Word & $ASCII,
12234                     );
12235     $PerlWord->add_alias('PosixWord');
12236
12237     my $Blank = $perl->add_match_table('Blank',
12238                                 Description => '\h, Horizontal white space',
12239
12240                                 # 200B is Zero Width Space which is for line
12241                                 # break control, and was listed as
12242                                 # Space_Separator in early releases
12243                                 Initialize => $gc->table('Space_Separator')
12244                                             +   0x0009  # TAB
12245                                             -   0x200B, # ZWSP
12246                                 );
12247     $Blank->add_alias('HorizSpace');        # Another name for it.
12248     $Blank->add_alias('XPosixBlank');
12249     $perl->add_match_table("PosixBlank",
12250                             Description => "\\t and ' '",
12251                             Initialize => $Blank & $ASCII,
12252                             );
12253
12254     my $VertSpace = $perl->add_match_table('VertSpace',
12255                             Description => '\v',
12256                             Initialize => $gc->table('Line_Separator')
12257                                         + $gc->table('Paragraph_Separator')
12258                                         + 0x000A  # LINE FEED
12259                                         + 0x000B  # VERTICAL TAB
12260                                         + 0x000C  # FORM FEED
12261                                         + 0x000D  # CARRIAGE RETURN
12262                                         + 0x0085, # NEL
12263                             );
12264     # No Posix equivalent for vertical space
12265
12266     my $Space = $perl->add_match_table('Space',
12267                 Description => '\s including beyond ASCII plus vertical tab',
12268                 Initialize => $Blank + $VertSpace,
12269     );
12270     $Space->add_alias('XPosixSpace');
12271     $perl->add_match_table("PosixSpace",
12272                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
12273                             Initialize => $Space & $ASCII,
12274                             );
12275
12276     # Perl's traditional space doesn't include Vertical Tab
12277     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
12278                                   Description => '\s, including beyond ASCII',
12279                                   Initialize => $Space - 0x000B,
12280                                 );
12281     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
12282     my $PerlSpace = $perl->add_match_table('PerlSpace',
12283                         Description => '\s, restricted to ASCII = [ \f\n\r\t]',
12284                         Initialize => $XPerlSpace & $ASCII,
12285                             );
12286
12287
12288     my $Cntrl = $perl->add_match_table('Cntrl',
12289                                         Description => 'Control characters');
12290     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
12291     $Cntrl->add_alias('XPosixCntrl');
12292     $perl->add_match_table("PosixCntrl",
12293                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
12294                             Initialize => $Cntrl & $ASCII,
12295                             );
12296
12297     # $controls is a temporary used to construct Graph.
12298     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
12299                                                 + $gc->table('Control'));
12300     # Cs not in release 1
12301     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
12302
12303     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
12304     my $Graph = $perl->add_match_table('Graph',
12305                         Description => 'Characters that are graphical',
12306                         Initialize => ~ ($Space + $controls),
12307                         );
12308     $Graph->add_alias('XPosixGraph');
12309     $perl->add_match_table("PosixGraph",
12310                             Description =>
12311                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
12312                             Initialize => $Graph & $ASCII,
12313                             );
12314
12315     $print = $perl->add_match_table('Print',
12316                         Description => 'Characters that are graphical plus space characters (but no controls)',
12317                         Initialize => $Blank + $Graph - $gc->table('Control'),
12318                         );
12319     $print->add_alias('XPosixPrint');
12320     $perl->add_match_table("PosixPrint",
12321                             Description =>
12322                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12323                             Initialize => $print & $ASCII,
12324                             );
12325
12326     my $Punct = $perl->add_match_table('Punct');
12327     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
12328
12329     # \p{punct} doesn't include the symbols, which posix does
12330     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
12331                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
12332                     Initialize => $gc->table('Punctuation')
12333                                 + ($ASCII & $gc->table('Symbol')),
12334                                 Perl_Extension => 1
12335         );
12336     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
12337         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12338         Initialize => $ASCII & $XPosixPunct,
12339         );
12340
12341     my $Digit = $perl->add_match_table('Digit',
12342                             Description => '[0-9] + all other decimal digits');
12343     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
12344     $Digit->add_alias('XPosixDigit');
12345     my $PosixDigit = $perl->add_match_table("PosixDigit",
12346                                             Description => '[0-9]',
12347                                             Initialize => $Digit & $ASCII,
12348                                             );
12349
12350     # Hex_Digit was not present in first release
12351     my $Xdigit = $perl->add_match_table('XDigit');
12352     $Xdigit->add_alias('XPosixXDigit');
12353     my $Hex = property_ref('Hex_Digit');
12354     if (defined $Hex && ! $Hex->is_empty) {
12355         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
12356     }
12357     else {
12358         # (Have to use hex instead of e.g. '0', because could be running on an
12359         # non-ASCII machine, and we want the Unicode (ASCII) values)
12360         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
12361                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
12362         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
12363     }
12364
12365     # AHex was not present in early releases
12366     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
12367     my $AHex = property_ref('ASCII_Hex_Digit');
12368     if (defined $AHex && ! $AHex->is_empty) {
12369         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
12370     }
12371     else {
12372         $PosixXDigit->initialize($Xdigit & $ASCII);
12373     }
12374     $PosixXDigit->add_description('[0-9A-Fa-f]');
12375
12376     my $dt = property_ref('Decomposition_Type');
12377     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
12378         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
12379         Perl_Extension => 1,
12380         Note => 'Union of all non-canonical decompositions',
12381         );
12382
12383     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
12384     # than SD appeared, construct it ourselves, based on the first release SD
12385     # was in.  A pod entry is grandfathered in for it
12386     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
12387                                            Perl_Extension => 1,
12388                                            Fate => $INTERNAL_ONLY,
12389                                            Status => $DISCOURAGED);
12390     my $soft_dotted = property_ref('Soft_Dotted');
12391     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
12392         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
12393     }
12394     else {
12395
12396         # This list came from 3.2 Soft_Dotted.
12397         $CanonDCIJ->initialize([ 0x0069,
12398                                  0x006A,
12399                                  0x012F,
12400                                  0x0268,
12401                                  0x0456,
12402                                  0x0458,
12403                                  0x1E2D,
12404                                  0x1ECB,
12405                                ]);
12406         $CanonDCIJ = $CanonDCIJ & $Assigned;
12407     }
12408
12409     # These are used in Unicode's definition of \X
12410     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
12411                                        Fate => $INTERNAL_ONLY);
12412     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
12413                                         Fate => $INTERNAL_ONLY);
12414
12415     # For backward compatibility, Perl has its own definition for IDStart
12416     # First, we include the underscore, and then the regular XID_Start also
12417     # have to be Words
12418     $perl->add_match_table('_Perl_IDStart',
12419                            Perl_Extension => 1,
12420                            Fate => $INTERNAL_ONLY,
12421                            Initialize =>
12422                              ord('_')
12423                              + (property_ref('XID_Start')->table('Y') & $Word)
12424                            );
12425
12426     my $gcb = property_ref('Grapheme_Cluster_Break');
12427
12428     # The 'extended' grapheme cluster came in 5.1.  The non-extended
12429     # definition differs too much from the traditional Perl one to use.
12430     if (defined $gcb && defined $gcb->table('SpacingMark')) {
12431
12432         # Note that assumes HST is defined; it came in an earlier release than
12433         # GCB.  In the line below, two negatives means: yes hangul
12434         $begin += ~ property_ref('Hangul_Syllable_Type')
12435                                                     ->table('Not_Applicable')
12436                + ~ ($gcb->table('Control')
12437                     + $gcb->table('CR')
12438                     + $gcb->table('LF'));
12439         $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12440
12441         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12442         $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
12443     }
12444     else {    # Old definition, used on early releases.
12445         $extend += $gc->table('Mark')
12446                 + 0x200C    # ZWNJ
12447                 + 0x200D;   # ZWJ
12448         $begin += ~ $extend;
12449
12450         # Here we may have a release that has the regular grapheme cluster
12451         # defined, or a release that doesn't have anything defined.
12452         # We set things up so the Perl core degrades gracefully, possibly with
12453         # placeholders that match nothing.
12454
12455         if (! defined $gcb) {
12456             $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12457         }
12458         my $hst = property_ref('HST');
12459         if (!defined $hst) {
12460             $hst = Property->new('HST', Status => $PLACEHOLDER);
12461             $hst->add_match_table('Not_Applicable',
12462                                 Initialize => $Any,
12463                                 Matches_All => 1);
12464         }
12465
12466         # On some releases, here we may not have the needed tables for the
12467         # perl core, in some releases we may.
12468         foreach my $name (qw{ L LV LVT T V prepend }) {
12469             my $table = $gcb->table($name);
12470             if (! defined $table) {
12471                 $table = $gcb->add_match_table($name);
12472                 push @tables_that_may_be_empty, $table->complete_name;
12473             }
12474
12475             # The HST property predates the GCB one, and has identical tables
12476             # for some of them, so use it if we can.
12477             if ($table->is_empty
12478                 && defined $hst
12479                 && defined $hst->table($name))
12480             {
12481                 $table += $hst->table($name);
12482             }
12483         }
12484     }
12485
12486     # More GCB.  If we found some hangul syllables, populate a combined
12487     # table.
12488     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12489                                           Perl_Extension => 1,
12490                                           Fate => $INTERNAL_ONLY);
12491     my $LV = $gcb->table('LV');
12492     if ($LV->is_empty) {
12493         push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12494     } else {
12495         $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12496         $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
12497     }
12498
12499     # Was previously constructed to contain both Name and Unicode_1_Name
12500     my @composition = ('Name', 'Unicode_1_Name');
12501
12502     if (@named_sequences) {
12503         push @composition, 'Named_Sequence';
12504         foreach my $sequence (@named_sequences) {
12505             $perl_charname->add_anomalous_entry($sequence);
12506         }
12507     }
12508
12509     my $alias_sentence = "";
12510     my $alias = property_ref('Name_Alias');
12511     if (defined $alias) {
12512         push @composition, 'Name_Alias';
12513         $perl_charname->set_proxy_for('Name_Alias');
12514         my $unicode_1 = property_ref('Unicode_1_Name');
12515         my %abbreviations;
12516
12517         # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
12518         # respect to any existing entry depends on the entry type.
12519         # Corrections go before said entry, as they should be returned in
12520         # preference over the existing entry.  (A correction to a correction
12521         # should be later in the Name_Alias table, so it will correctly
12522         # precede the erroneous correction in Perl_Charnames.)
12523         #
12524         # Abbreviations go after everything else, so they are saved
12525         # temporarily in a hash for later.
12526         #
12527         # Controls are currently added afterwards.  This is because Perl has
12528         # previously used the Unicode1 name, and so should still use that.
12529         # (Most of them will be the same anyway, in which case we don't add a
12530         # duplicate)
12531
12532         $alias->reset_each_range;
12533         while (my ($range) = $alias->each_range) {
12534             next if $range->value eq "";
12535             my $code_point = $range->start;
12536             if ($code_point != $range->end) {
12537                 Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
12538             }
12539             my ($value, $type) = split ': ', $range->value;
12540             my $replace_type;
12541             if ($type eq 'correction') {
12542                 $replace_type = $MULTIPLE_BEFORE;
12543             }
12544             elsif ($type eq 'abbreviation') {
12545
12546                 # Save for later
12547                 $abbreviations{$value} = $code_point;
12548                 next;
12549             }
12550             elsif ($type eq 'control') {
12551                 my $unicode_1_value = $unicode_1->value_of($code_point);
12552                 next if $unicode_1_value eq $value;
12553                 $replace_type = $MULTIPLE_AFTER;
12554             }
12555             else {
12556                 $replace_type = $MULTIPLE_AFTER;
12557             }
12558
12559             # Actually add; before or after current entry(ies) as determined
12560             # above.
12561             $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
12562         }
12563
12564         # Now that have everything added, add in abbreviations after
12565         # everything else.
12566         foreach my $value (keys %abbreviations) {
12567             $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
12568         }
12569         $alias_sentence = <<END;
12570 The Name_Alias property adds duplicate code point entries that are
12571 alternatives to the original name.  If an addition is a corrected
12572 name, it will be physically first in the table.  The original (less correct,
12573 but still valid) name will be next; then any alternatives, in no particular
12574 order; and finally any abbreviations, again in no particular order.
12575 END
12576     }
12577
12578     my $comment;
12579     if (@composition <= 2) { # Always at least 2
12580         $comment = join " and ", @composition;
12581     }
12582     else {
12583         $comment = join ", ", @composition[0 .. scalar @composition - 2];
12584         $comment .= ", and $composition[-1]";
12585     }
12586
12587     $perl_charname->add_comment(join_lines( <<END
12588 This file is for charnames.pm.  It is the union of the $comment properties.
12589 Unicode_1_Name entries are used only for nameless code points in the Name
12590 property.
12591 $alias_sentence
12592 This file doesn't include the algorithmically determinable names.  For those,
12593 use 'unicore/Name.pm'
12594 END
12595     ));
12596     property_ref('Name')->add_comment(join_lines( <<END
12597 This file doesn't include the algorithmically determinable names.  For those,
12598 use 'unicore/Name.pm'
12599 END
12600     ));
12601
12602     # Construct the Present_In property from the Age property.
12603     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12604         my $default_map = $age->default_map;
12605         my $in = Property->new('In',
12606                                 Default_Map => $default_map,
12607                                 Full_Name => "Present_In",
12608                                 Perl_Extension => 1,
12609                                 Type => $ENUM,
12610                                 Initialize => $age,
12611                                 );
12612         $in->add_comment(join_lines(<<END
12613 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
12614 same as for $age, and not for what $in really means.  This is because anything
12615 defined in a given release should have multiple values: that release and all
12616 higher ones.  But only one value per code point can be represented in a table
12617 like this.
12618 END
12619         ));
12620
12621         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
12622         # lowest numbered (earliest) come first, with the non-numeric one
12623         # last.
12624         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12625                                             ? 1
12626                                             : ($b->name !~ /^[\d.]*$/)
12627                                                 ? -1
12628                                                 : $a->name <=> $b->name
12629                                             } $age->tables;
12630
12631         # The Present_In property is the cumulative age properties.  The first
12632         # one hence is identical to the first age one.
12633         my $previous_in = $in->add_match_table($first_age->name);
12634         $previous_in->set_equivalent_to($first_age, Related => 1);
12635
12636         my $description_start = "Code point's usage introduced in version ";
12637         $first_age->add_description($description_start . $first_age->name);
12638
12639         # To construct the accumulated values, for each of the age tables
12640         # starting with the 2nd earliest, merge the earliest with it, to get
12641         # all those code points existing in the 2nd earliest.  Repeat merging
12642         # the new 2nd earliest with the 3rd earliest to get all those existing
12643         # in the 3rd earliest, and so on.
12644         foreach my $current_age (@rest_ages) {
12645             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
12646
12647             my $current_in = $in->add_match_table(
12648                                     $current_age->name,
12649                                     Initialize => $current_age + $previous_in,
12650                                     Description => $description_start
12651                                                     . $current_age->name
12652                                                     . ' or earlier',
12653                                     );
12654             $previous_in = $current_in;
12655
12656             # Add clarifying material for the corresponding age file.  This is
12657             # in part because of the confusing and contradictory information
12658             # given in the Standard's documentation itself, as of 5.2.
12659             $current_age->add_description(
12660                             "Code point's usage was introduced in version "
12661                             . $current_age->name);
12662             $current_age->add_note("See also $in");
12663
12664         }
12665
12666         # And finally the code points whose usages have yet to be decided are
12667         # the same in both properties.  Note that permanently unassigned code
12668         # points actually have their usage assigned (as being permanently
12669         # unassigned), so that these tables are not the same as gc=cn.
12670         my $unassigned = $in->add_match_table($default_map);
12671         my $age_default = $age->table($default_map);
12672         $age_default->add_description(<<END
12673 Code point's usage has not been assigned in any Unicode release thus far.
12674 END
12675         );
12676         $unassigned->set_equivalent_to($age_default, Related => 1);
12677     }
12678
12679
12680     # Finished creating all the perl properties.  All non-internal non-string
12681     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
12682     # an underscore.)  These do not get a separate entry in the pod file
12683     foreach my $table ($perl->tables) {
12684         foreach my $alias ($table->aliases) {
12685             next if $alias->name =~ /^_/;
12686             $table->add_alias('Is_' . $alias->name,
12687                                Re_Pod_Entry => 0,
12688                                UCD => 0,
12689                                Status => $alias->status,
12690                                OK_as_Filename => 0);
12691         }
12692     }
12693
12694     # Here done with all the basic stuff.  Ready to populate the information
12695     # about each character if annotating them.
12696     if ($annotate) {
12697
12698         # See comments at its declaration
12699         $annotate_ranges = Range_Map->new;
12700
12701         # This separates out the non-characters from the other unassigneds, so
12702         # can give different annotations for each.
12703         $unassigned_sans_noncharacters = Range_List->new(
12704          Initialize => $gc->table('Unassigned')
12705                        & property_ref('Noncharacter_Code_Point')->table('N'));
12706
12707         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
12708             $i = populate_char_info($i);    # Note sets $i so may cause skips
12709         }
12710     }
12711
12712     return;
12713 }
12714
12715 sub add_perl_synonyms() {
12716     # A number of Unicode tables have Perl synonyms that are expressed in
12717     # the single-form, \p{name}.  These are:
12718     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12719     #       \p{Is_Name} as synonyms
12720     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12721     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12722     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12723     #       conflict, \p{Value} and \p{Is_Value} as well
12724     #
12725     # This routine generates these synonyms, warning of any unexpected
12726     # conflicts.
12727
12728     # Construct the list of tables to get synonyms for.  Start with all the
12729     # binary and the General_Category ones.
12730     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
12731                                                             property_ref('*');
12732     push @tables, $gc->tables;
12733
12734     # If the version of Unicode includes the Script property, add its tables
12735     push @tables, $script->tables if defined $script;
12736
12737     # The Block tables are kept separate because they are treated differently.
12738     # And the earliest versions of Unicode didn't include them, so add only if
12739     # there are some.
12740     my @blocks;
12741     push @blocks, $block->tables if defined $block;
12742
12743     # Here, have the lists of tables constructed.  Process blocks last so that
12744     # if there are name collisions with them, blocks have lowest priority.
12745     # Should there ever be other collisions, manual intervention would be
12746     # required.  See the comments at the beginning of the program for a
12747     # possible way to handle those semi-automatically.
12748     foreach my $table (@tables,  @blocks) {
12749
12750         # For non-binary properties, the synonym is just the name of the
12751         # table, like Greek, but for binary properties the synonym is the name
12752         # of the property, and means the code points in its 'Y' table.
12753         my $nominal = $table;
12754         my $nominal_property = $nominal->property;
12755         my $actual;
12756         if (! $nominal->isa('Property')) {
12757             $actual = $table;
12758         }
12759         else {
12760
12761             # Here is a binary property.  Use the 'Y' table.  Verify that is
12762             # there
12763             my $yes = $nominal->table('Y');
12764             unless (defined $yes) {  # Must be defined, but is permissible to
12765                                      # be empty.
12766                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
12767                 next;
12768             }
12769             $actual = $yes;
12770         }
12771
12772         foreach my $alias ($nominal->aliases) {
12773
12774             # Attempt to create a table in the perl directory for the
12775             # candidate table, using whatever aliases in it that don't
12776             # conflict.  Also add non-conflicting aliases for all these
12777             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12778             PREFIX:
12779             foreach my $prefix ("", 'Is_', 'In_') {
12780
12781                 # Only Block properties can have added 'In_' aliases.
12782                 next if $prefix eq 'In_' and $nominal_property != $block;
12783
12784                 my $proposed_name = $prefix . $alias->name;
12785
12786                 # No Is_Is, In_In, nor combinations thereof
12787                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12788                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12789
12790                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12791
12792                 # Get a reference to any existing table in the perl
12793                 # directory with the desired name.
12794                 my $pre_existing = $perl->table($proposed_name);
12795
12796                 if (! defined $pre_existing) {
12797
12798                     # No name collision, so ok to add the perl synonym.
12799
12800                     my $make_re_pod_entry;
12801                     my $ok_as_filename;
12802                     my $status = $alias->status;
12803                     if ($nominal_property == $block) {
12804
12805                         # For block properties, the 'In' form is preferred for
12806                         # external use; the pod file contains wild cards for
12807                         # this and the 'Is' form so no entries for those; and
12808                         # we don't want people using the name without the
12809                         # 'In', so discourage that.
12810                         if ($prefix eq "") {
12811                             $make_re_pod_entry = 1;
12812                             $status = $status || $DISCOURAGED;
12813                             $ok_as_filename = 0;
12814                         }
12815                         elsif ($prefix eq 'In_') {
12816                             $make_re_pod_entry = 0;
12817                             $status = $status || $NORMAL;
12818                             $ok_as_filename = 1;
12819                         }
12820                         else {
12821                             $make_re_pod_entry = 0;
12822                             $status = $status || $DISCOURAGED;
12823                             $ok_as_filename = 0;
12824                         }
12825                     }
12826                     elsif ($prefix ne "") {
12827
12828                         # The 'Is' prefix is handled in the pod by a wild
12829                         # card, and we won't use it for an external name
12830                         $make_re_pod_entry = 0;
12831                         $status = $status || $NORMAL;
12832                         $ok_as_filename = 0;
12833                     }
12834                     else {
12835
12836                         # Here, is an empty prefix, non block.  This gets its
12837                         # own pod entry and can be used for an external name.
12838                         $make_re_pod_entry = 1;
12839                         $status = $status || $NORMAL;
12840                         $ok_as_filename = 1;
12841                     }
12842
12843                     # Here, there isn't a perl pre-existing table with the
12844                     # name.  Look through the list of equivalents of this
12845                     # table to see if one is a perl table.
12846                     foreach my $equivalent ($actual->leader->equivalents) {
12847                         next if $equivalent->property != $perl;
12848
12849                         # Here, have found a table for $perl.  Add this alias
12850                         # to it, and are done with this prefix.
12851                         $equivalent->add_alias($proposed_name,
12852                                         Re_Pod_Entry => $make_re_pod_entry,
12853
12854                                         # Currently don't output these in the
12855                                         # ucd pod, as are strongly discouraged
12856                                         # from being used
12857                                         UCD => 0,
12858
12859                                         Status => $status,
12860                                         OK_as_Filename => $ok_as_filename);
12861                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12862                         next PREFIX;
12863                     }
12864
12865                     # Here, $perl doesn't already have a table that is a
12866                     # synonym for this property, add one.
12867                     my $added_table = $perl->add_match_table($proposed_name,
12868                                             Re_Pod_Entry => $make_re_pod_entry,
12869
12870                                             # See UCD comment just above
12871                                             UCD => 0,
12872
12873                                             Status => $status,
12874                                             OK_as_Filename => $ok_as_filename);
12875                     # And it will be related to the actual table, since it is
12876                     # based on it.
12877                     $added_table->set_equivalent_to($actual, Related => 1);
12878                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12879                     next;
12880                 } # End of no pre-existing.
12881
12882                 # Here, there is a pre-existing table that has the proposed
12883                 # name.  We could be in trouble, but not if this is just a
12884                 # synonym for another table that we have already made a child
12885                 # of the pre-existing one.
12886                 if ($pre_existing->is_set_equivalent_to($actual)) {
12887                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12888                     $pre_existing->add_alias($proposed_name);
12889                     next;
12890                 }
12891
12892                 # Here, there is a name collision, but it still could be ok if
12893                 # the tables match the identical set of code points, in which
12894                 # case, we can combine the names.  Compare each table's code
12895                 # point list to see if they are identical.
12896                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12897                 if ($pre_existing->matches_identically_to($actual)) {
12898
12899                     # Here, they do match identically.  Not a real conflict.
12900                     # Make the perl version a child of the Unicode one, except
12901                     # in the non-obvious case of where the perl name is
12902                     # already a synonym of another Unicode property.  (This is
12903                     # excluded by the test for it being its own parent.)  The
12904                     # reason for this exclusion is that then the two Unicode
12905                     # properties become related; and we don't really know if
12906                     # they are or not.  We generate documentation based on
12907                     # relatedness, and this would be misleading.  Code
12908                     # later executed in the process will cause the tables to
12909                     # be represented by a single file anyway, without making
12910                     # it look in the pod like they are necessarily related.
12911                     if ($pre_existing->parent == $pre_existing
12912                         && ($pre_existing->property == $perl
12913                             || $actual->property == $perl))
12914                     {
12915                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12916                         $pre_existing->set_equivalent_to($actual, Related => 1);
12917                     }
12918                     elsif (main::DEBUG && $to_trace) {
12919                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12920                         trace $pre_existing->parent;
12921                     }
12922                     next PREFIX;
12923                 }
12924
12925                 # Here they didn't match identically, there is a real conflict
12926                 # between our new name and a pre-existing property.
12927                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12928                 $pre_existing->add_conflicting($nominal->full_name,
12929                                                'p',
12930                                                $actual);
12931
12932                 # Don't output a warning for aliases for the block
12933                 # properties (unless they start with 'In_') as it is
12934                 # expected that there will be conflicts and the block
12935                 # form loses.
12936                 if ($verbosity >= $NORMAL_VERBOSITY
12937                     && ($actual->property != $block || $prefix eq 'In_'))
12938                 {
12939                     print simple_fold(join_lines(<<END
12940 There is already an alias named $proposed_name (from " . $pre_existing . "),
12941 so not creating this alias for " . $actual
12942 END
12943                     ), "", 4);
12944                 }
12945
12946                 # Keep track for documentation purposes.
12947                 $has_In_conflicts++ if $prefix eq 'In_';
12948                 $has_Is_conflicts++ if $prefix eq 'Is_';
12949             }
12950         }
12951     }
12952
12953     # There are some properties which have No and Yes (and N and Y) as
12954     # property values, but aren't binary, and could possibly be confused with
12955     # binary ones.  So create caveats for them.  There are tables that are
12956     # named 'No', and tables that are named 'N', but confusion is not likely
12957     # unless they are the same table.  For example, N meaning Number or
12958     # Neutral is not likely to cause confusion, so don't add caveats to things
12959     # like them.
12960     foreach my $property (grep { $_->type != $BINARY
12961                                  && $_->type != $FORCED_BINARY }
12962                                                             property_ref('*'))
12963     {
12964         my $yes = $property->table('Yes');
12965         if (defined $yes) {
12966             my $y = $property->table('Y');
12967             if (defined $y && $yes == $y) {
12968                 foreach my $alias ($property->aliases) {
12969                     $yes->add_conflicting($alias->name);
12970                 }
12971             }
12972         }
12973         my $no = $property->table('No');
12974         if (defined $no) {
12975             my $n = $property->table('N');
12976             if (defined $n && $no == $n) {
12977                 foreach my $alias ($property->aliases) {
12978                     $no->add_conflicting($alias->name, 'P');
12979                 }
12980             }
12981         }
12982     }
12983
12984     return;
12985 }
12986
12987 sub register_file_for_name($$$) {
12988     # Given info about a table and a datafile that it should be associated
12989     # with, register that association
12990
12991     my $table = shift;
12992     my $directory_ref = shift;   # Array of the directory path for the file
12993     my $file = shift;            # The file name in the final directory.
12994     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12995
12996     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12997
12998     if ($table->isa('Property')) {
12999         $table->set_file_path(@$directory_ref, $file);
13000         push @map_properties, $table;
13001
13002         # No swash means don't do the rest of this.
13003         return if $table->fate != $ORDINARY;
13004
13005         # Get the path to the file
13006         my @path = $table->file_path;
13007
13008         # Use just the file name if no subdirectory.
13009         shift @path if $path[0] eq File::Spec->curdir();
13010
13011         my $file = join '/', @path;
13012
13013         # Create a hash entry for utf8_heavy to get the file that stores this
13014         # property's map table
13015         foreach my $alias ($table->aliases) {
13016             my $name = $alias->name;
13017             $loose_property_to_file_of{standardize($name)} = $file;
13018         }
13019
13020         # And a way for utf8_heavy to find the proper key in the SwashInfo
13021         # hash for this property.
13022         $file_to_swash_name{$file} = "To" . $table->swash_name;
13023         return;
13024     }
13025
13026     # Do all of the work for all equivalent tables when called with the leader
13027     # table, so skip if isn't the leader.
13028     return if $table->leader != $table;
13029
13030     # If this is a complement of another file, use that other file instead,
13031     # with a ! prepended to it.
13032     my $complement;
13033     if (($complement = $table->complement) != 0) {
13034         my @directories = $complement->file_path;
13035
13036         # This assumes that the 0th element is something like 'lib',
13037         # the 1th element the property name (in its own directory), like
13038         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
13039         # appended to it later.
13040         $directories[1] =~ s/^/!/;
13041         $file = pop @directories;
13042         $directory_ref =\@directories;
13043     }
13044
13045     # Join all the file path components together, using slashes.
13046     my $full_filename = join('/', @$directory_ref, $file);
13047
13048     # All go in the same subdirectory of unicore
13049     if ($directory_ref->[0] ne $matches_directory) {
13050         Carp::my_carp("Unexpected directory in "
13051                 .  join('/', @{$directory_ref}, $file));
13052     }
13053
13054     # For this table and all its equivalents ...
13055     foreach my $table ($table, $table->equivalents) {
13056
13057         # Associate it with its file internally.  Don't include the
13058         # $matches_directory first component
13059         $table->set_file_path(@$directory_ref, $file);
13060
13061         # No swash means don't do the rest of this.
13062         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
13063
13064         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
13065
13066         my $property = $table->property;
13067         my $property_name = ($property == $perl)
13068                              ? ""  # 'perl' is never explicitly stated
13069                              : standardize($property->name) . '=';
13070
13071         my $is_default = 0; # Is this table the default one for the property?
13072
13073         # To calculate $is_default, we find if this table is the same as the
13074         # default one for the property.  But this is complicated by the
13075         # possibility that there is a master table for this one, and the
13076         # information is stored there instead of here.
13077         my $parent = $table->parent;
13078         my $leader_prop = $parent->property;
13079         my $default_map = $leader_prop->default_map;
13080         if (defined $default_map) {
13081             my $default_table = $leader_prop->table($default_map);
13082             $is_default = 1 if defined $default_table && $parent == $default_table;
13083         }
13084
13085         # Calculate the loose name for this table.  Mostly it's just its name,
13086         # standardized.  But in the case of Perl tables that are single-form
13087         # equivalents to Unicode properties, it is the latter's name.
13088         my $loose_table_name =
13089                         ($property != $perl || $leader_prop == $perl)
13090                         ? standardize($table->name)
13091                         : standardize($parent->name);
13092
13093         my $deprecated = ($table->status eq $DEPRECATED)
13094                          ? $table->status_info
13095                          : "";
13096         my $caseless_equivalent = $table->caseless_equivalent;
13097
13098         # And for each of the table's aliases...  This inner loop eventually
13099         # goes through all aliases in the UCD that we generate regex match
13100         # files for
13101         foreach my $alias ($table->aliases) {
13102             my $standard = utf8_heavy_name($table, $alias);
13103
13104             # Generate an entry in either the loose or strict hashes, which
13105             # will translate the property and alias names combination into the
13106             # file where the table for them is stored.
13107             if ($alias->loose_match) {
13108                 if (exists $loose_to_file_of{$standard}) {
13109                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
13110                 }
13111                 else {
13112                     $loose_to_file_of{$standard} = $sub_filename;
13113                 }
13114             }
13115             else {
13116                 if (exists $stricter_to_file_of{$standard}) {
13117                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
13118                 }
13119                 else {
13120                     $stricter_to_file_of{$standard} = $sub_filename;
13121
13122                     # Tightly coupled with how utf8_heavy.pl works, for a
13123                     # floating point number that is a whole number, get rid of
13124                     # the trailing decimal point and 0's, so that utf8_heavy
13125                     # will work.  Also note that this assumes that such a
13126                     # number is matched strictly; so if that were to change,
13127                     # this would be wrong.
13128                     if ((my $integer_name = $alias->name)
13129                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
13130                     {
13131                         $stricter_to_file_of{$property_name . $integer_name}
13132                                                             = $sub_filename;
13133                     }
13134                 }
13135             }
13136
13137             # For Unicode::UCD, create a mapping of the prop=value to the
13138             # canonical =value for that property.
13139             if ($standard =~ /=/) {
13140
13141                 # This could happen if a strict name mapped into an existing
13142                 # loose name.  In that event, the strict names would have to
13143                 # be moved to a new hash.
13144                 if (exists($loose_to_standard_value{$standard})) {
13145                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
13146                 }
13147                 $loose_to_standard_value{$standard} = $loose_table_name;
13148             }
13149
13150             # Keep a list of the deprecated properties and their filenames
13151             if ($deprecated && $complement == 0) {
13152                 $utf8::why_deprecated{$sub_filename} = $deprecated;
13153             }
13154
13155             # And a substitute table, if any, for case-insensitive matching
13156             if ($caseless_equivalent != 0) {
13157                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
13158             }
13159
13160             # Add to defaults list if the table this alias belongs to is the
13161             # default one
13162             $loose_defaults{$standard} = 1 if $is_default;
13163         }
13164     }
13165
13166     return;
13167 }
13168
13169 {   # Closure
13170     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
13171                      # conflicts
13172     my %full_dir_name_of;   # Full length names of directories used.
13173
13174     sub construct_filename($$$) {
13175         # Return a file name for a table, based on the table name, but perhaps
13176         # changed to get rid of non-portable characters in it, and to make
13177         # sure that it is unique on a file system that allows the names before
13178         # any period to be at most 8 characters (DOS).  While we're at it
13179         # check and complain if there are any directory conflicts.
13180
13181         my $name = shift;       # The name to start with
13182         my $mutable = shift;    # Boolean: can it be changed?  If no, but
13183                                 # yet it must be to work properly, a warning
13184                                 # is given
13185         my $directories_ref = shift;  # A reference to an array containing the
13186                                 # path to the file, with each element one path
13187                                 # component.  This is used because the same
13188                                 # name can be used in different directories.
13189         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13190
13191         my $warn = ! defined wantarray;  # If true, then if the name is
13192                                 # changed, a warning is issued as well.
13193
13194         if (! defined $name) {
13195             Carp::my_carp("Undefined name in directory "
13196                           . File::Spec->join(@$directories_ref)
13197                           . ". '_' used");
13198             return '_';
13199         }
13200
13201         # Make sure that no directory names conflict with each other.  Look at
13202         # each directory in the input file's path.  If it is already in use,
13203         # assume it is correct, and is merely being re-used, but if we
13204         # truncate it to 8 characters, and find that there are two directories
13205         # that are the same for the first 8 characters, but differ after that,
13206         # then that is a problem.
13207         foreach my $directory (@$directories_ref) {
13208             my $short_dir = substr($directory, 0, 8);
13209             if (defined $full_dir_name_of{$short_dir}) {
13210                 next if $full_dir_name_of{$short_dir} eq $directory;
13211                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
13212             }
13213             else {
13214                 $full_dir_name_of{$short_dir} = $directory;
13215             }
13216         }
13217
13218         my $path = join '/', @$directories_ref;
13219         $path .= '/' if $path;
13220
13221         # Remove interior underscores.
13222         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
13223
13224         # Change any non-word character into an underscore, and truncate to 8.
13225         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
13226         substr($filename, 8) = "" if length($filename) > 8;
13227
13228         # Make sure the basename doesn't conflict with something we
13229         # might have already written. If we have, say,
13230         #     InGreekExtended1
13231         #     InGreekExtended2
13232         # they become
13233         #     InGreekE
13234         #     InGreek2
13235         my $warned = 0;
13236         while (my $num = $base_names{$path}{lc $filename}++) {
13237             $num++; # so basenames with numbers start with '2', which
13238                     # just looks more natural.
13239
13240             # Want to append $num, but if it'll make the basename longer
13241             # than 8 characters, pre-truncate $filename so that the result
13242             # is acceptable.
13243             my $delta = length($filename) + length($num) - 8;
13244             if ($delta > 0) {
13245                 substr($filename, -$delta) = $num;
13246             }
13247             else {
13248                 $filename .= $num;
13249             }
13250             if ($warn && ! $warned) {
13251                 $warned = 1;
13252                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
13253             }
13254         }
13255
13256         return $filename if $mutable;
13257
13258         # If not changeable, must return the input name, but warn if needed to
13259         # change it beyond shortening it.
13260         if ($name ne $filename
13261             && substr($name, 0, length($filename)) ne $filename) {
13262             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
13263         }
13264         return $name;
13265     }
13266 }
13267
13268 # The pod file contains a very large table.  Many of the lines in that table
13269 # would exceed a typical output window's size, and so need to be wrapped with
13270 # a hanging indent to make them look good.  The pod language is really
13271 # insufficient here.  There is no general construct to do that in pod, so it
13272 # is done here by beginning each such line with a space to cause the result to
13273 # be output without formatting, and doing all the formatting here.  This leads
13274 # to the result that if the eventual display window is too narrow it won't
13275 # look good, and if the window is too wide, no advantage is taken of that
13276 # extra width.  A further complication is that the output may be indented by
13277 # the formatter so that there is less space than expected.  What I (khw) have
13278 # done is to assume that that indent is a particular number of spaces based on
13279 # what it is in my Linux system;  people can always resize their windows if
13280 # necessary, but this is obviously less than desirable, but the best that can
13281 # be expected.
13282 my $automatic_pod_indent = 8;
13283
13284 # Try to format so that uses fewest lines, but few long left column entries
13285 # slide into the right column.  An experiment on 5.1 data yielded the
13286 # following percentages that didn't cut into the other side along with the
13287 # associated first-column widths
13288 # 69% = 24
13289 # 80% not too bad except for a few blocks
13290 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
13291 # 95% = 37;
13292 my $indent_info_column = 27;    # 75% of lines didn't have overlap
13293
13294 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
13295                     # The 3 is because of:
13296                     #   1   for the leading space to tell the pod formatter to
13297                     #       output as-is
13298                     #   1   for the flag
13299                     #   1   for the space between the flag and the main data
13300
13301 sub format_pod_line ($$$;$$) {
13302     # Take a pod line and return it, formatted properly
13303
13304     my $first_column_width = shift;
13305     my $entry = shift;  # Contents of left column
13306     my $info = shift;   # Contents of right column
13307
13308     my $status = shift || "";   # Any flag
13309
13310     my $loose_match = shift;    # Boolean.
13311     $loose_match = 1 unless defined $loose_match;
13312
13313     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13314
13315     my $flags = "";
13316     $flags .= $STRICTER if ! $loose_match;
13317
13318     $flags .= $status if $status;
13319
13320     # There is a blank in the left column to cause the pod formatter to
13321     # output the line as-is.
13322     return sprintf " %-*s%-*s %s\n",
13323                     # The first * in the format is replaced by this, the -1 is
13324                     # to account for the leading blank.  There isn't a
13325                     # hard-coded blank after this to separate the flags from
13326                     # the rest of the line, so that in the unlikely event that
13327                     # multiple flags are shown on the same line, they both
13328                     # will get displayed at the expense of that separation,
13329                     # but since they are left justified, a blank will be
13330                     # inserted in the normal case.
13331                     $FILLER - 1,
13332                     $flags,
13333
13334                     # The other * in the format is replaced by this number to
13335                     # cause the first main column to right fill with blanks.
13336                     # The -1 is for the guaranteed blank following it.
13337                     $first_column_width - $FILLER - 1,
13338                     $entry,
13339                     $info;
13340 }
13341
13342 my @zero_match_tables;  # List of tables that have no matches in this release
13343
13344 sub make_re_pod_entries($) {
13345     # This generates the entries for the pod file for a given table.
13346     # Also done at this time are any children tables.  The output looks like:
13347     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
13348
13349     my $input_table = shift;        # Table the entry is for
13350     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13351
13352     # Generate parent and all its children at the same time.
13353     return if $input_table->parent != $input_table;
13354
13355     my $property = $input_table->property;
13356     my $type = $property->type;
13357     my $full_name = $property->full_name;
13358
13359     my $count = $input_table->count;
13360     my $string_count = clarify_number($count);
13361     my $status = $input_table->status;
13362     my $status_info = $input_table->status_info;
13363     my $caseless_equivalent = $input_table->caseless_equivalent;
13364
13365     my $entry_for_first_table; # The entry for the first table output.
13366                            # Almost certainly, it is the parent.
13367
13368     # For each related table (including itself), we will generate a pod entry
13369     # for each name each table goes by
13370     foreach my $table ($input_table, $input_table->children) {
13371
13372         # utf8_heavy.pl cannot deal with null string property values, so skip
13373         # any tables that have no non-null names.
13374         next if ! grep { $_->name ne "" } $table->aliases;
13375
13376         # First, gather all the info that applies to this table as a whole.
13377
13378         push @zero_match_tables, $table if $count == 0;
13379
13380         my $table_property = $table->property;
13381
13382         # The short name has all the underscores removed, while the full name
13383         # retains them.  Later, we decide whether to output a short synonym
13384         # for the full one, we need to compare apples to apples, so we use the
13385         # short name's length including underscores.
13386         my $table_property_short_name_length;
13387         my $table_property_short_name
13388             = $table_property->short_name(\$table_property_short_name_length);
13389         my $table_property_full_name = $table_property->full_name;
13390
13391         # Get how much savings there is in the short name over the full one
13392         # (delta will always be <= 0)
13393         my $table_property_short_delta = $table_property_short_name_length
13394                                          - length($table_property_full_name);
13395         my @table_description = $table->description;
13396         my @table_note = $table->note;
13397
13398         # Generate an entry for each alias in this table.
13399         my $entry_for_first_alias;  # saves the first one encountered.
13400         foreach my $alias ($table->aliases) {
13401
13402             # Skip if not to go in pod.
13403             next unless $alias->make_re_pod_entry;
13404
13405             # Start gathering all the components for the entry
13406             my $name = $alias->name;
13407
13408             # Skip if name is empty, as can't be accessed by regexes.
13409             next if $name eq "";
13410
13411             my $entry;      # Holds the left column, may include extras
13412             my $entry_ref;  # To refer to the left column's contents from
13413                             # another entry; has no extras
13414
13415             # First the left column of the pod entry.  Tables for the $perl
13416             # property always use the single form.
13417             if ($table_property == $perl) {
13418                 $entry = "\\p{$name}";
13419                 $entry_ref = "\\p{$name}";
13420             }
13421             else {    # Compound form.
13422
13423                 # Only generate one entry for all the aliases that mean true
13424                 # or false in binary properties.  Append a '*' to indicate
13425                 # some are missing.  (The heading comment notes this.)
13426                 my $rhs;
13427                 if ($type == $BINARY) {
13428                     next if $name ne 'N' && $name ne 'Y';
13429                     $rhs = "$name*";
13430                 }
13431                 elsif ($type != $FORCED_BINARY) {
13432                     $rhs = $name;
13433                 }
13434                 else {
13435
13436                     # Forced binary properties require special handling.  It
13437                     # has two sets of tables, one set is true/false; and the
13438                     # other set is everything else.  Entries are generated for
13439                     # each set.  Use the Bidi_Mirrored property (which appears
13440                     # in all Unicode versions) to get a list of the aliases
13441                     # for the true/false tables.  Of these, only output the N
13442                     # and Y ones, the same as, a regular binary property.  And
13443                     # output all the rest, same as a non-binary property.
13444                     my $bm = property_ref("Bidi_Mirrored");
13445                     if ($name eq 'N' || $name eq 'Y') {
13446                         $rhs = "$name*";
13447                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
13448                                                         $bm->table("N")->aliases)
13449                     {
13450                         next;
13451                     }
13452                     else {
13453                         $rhs = $name;
13454                     }
13455                 }
13456
13457                 # Colon-space is used to give a little more space to be easier
13458                 # to read;
13459                 $entry = "\\p{"
13460                         . $table_property_full_name
13461                         . ": $rhs}";
13462
13463                 # But for the reference to this entry, which will go in the
13464                 # right column, where space is at a premium, use equals
13465                 # without a space
13466                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
13467             }
13468
13469             # Then the right (info) column.  This is stored as components of
13470             # an array for the moment, then joined into a string later.  For
13471             # non-internal only properties, begin the info with the entry for
13472             # the first table we encountered (if any), as things are ordered
13473             # so that that one is the most descriptive.  This leads to the
13474             # info column of an entry being a more descriptive version of the
13475             # name column
13476             my @info;
13477             if ($name =~ /^_/) {
13478                 push @info,
13479                         '(For internal use by Perl, not necessarily stable)';
13480             }
13481             elsif ($entry_for_first_alias) {
13482                 push @info, $entry_for_first_alias;
13483             }
13484
13485             # If this entry is equivalent to another, add that to the info,
13486             # using the first such table we encountered
13487             if ($entry_for_first_table) {
13488                 if (@info) {
13489                     push @info, "(= $entry_for_first_table)";
13490                 }
13491                 else {
13492                     push @info, $entry_for_first_table;
13493                 }
13494             }
13495
13496             # If the name is a large integer, add an equivalent with an
13497             # exponent for better readability
13498             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
13499                 push @info, sprintf "(= %.1e)", $name
13500             }
13501
13502             my $parenthesized = "";
13503             if (! $entry_for_first_alias) {
13504
13505                 # This is the first alias for the current table.  The alias
13506                 # array is ordered so that this is the fullest, most
13507                 # descriptive alias, so it gets the fullest info.  The other
13508                 # aliases are mostly merely pointers to this one, using the
13509                 # information already added above.
13510
13511                 # Display any status message, but only on the parent table
13512                 if ($status && ! $entry_for_first_table) {
13513                     push @info, $status_info;
13514                 }
13515
13516                 # Put out any descriptive info
13517                 if (@table_description || @table_note) {
13518                     push @info, join "; ", @table_description, @table_note;
13519                 }
13520
13521                 # Look to see if there is a shorter name we can point people
13522                 # at
13523                 my $standard_name = standardize($name);
13524                 my $short_name;
13525                 my $proposed_short = $table->short_name;
13526                 if (defined $proposed_short) {
13527                     my $standard_short = standardize($proposed_short);
13528
13529                     # If the short name is shorter than the standard one, or
13530                     # even it it's not, but the combination of it and its
13531                     # short property name (as in \p{prop=short} ($perl doesn't
13532                     # have this form)) saves at least two characters, then,
13533                     # cause it to be listed as a shorter synonym.
13534                     if (length $standard_short < length $standard_name
13535                         || ($table_property != $perl
13536                             && (length($standard_short)
13537                                 - length($standard_name)
13538                                 + $table_property_short_delta)  # (<= 0)
13539                                 < -2))
13540                     {
13541                         $short_name = $proposed_short;
13542                         if ($table_property != $perl) {
13543                             $short_name = $table_property_short_name
13544                                           . "=$short_name";
13545                         }
13546                         $short_name = "\\p{$short_name}";
13547                     }
13548                 }
13549
13550                 # And if this is a compound form name, see if there is a
13551                 # single form equivalent
13552                 my $single_form;
13553                 if ($table_property != $perl) {
13554
13555                     # Special case the binary N tables, so that will print
13556                     # \P{single}, but use the Y table values to populate
13557                     # 'single', as we haven't likewise populated the N table.
13558                     # For forced binary tables, we can't just look at the N
13559                     # table, but must see if this table is equivalent to the N
13560                     # one, as there are two equivalent beasts in these
13561                     # properties.
13562                     my $test_table;
13563                     my $p;
13564                     if (   ($type == $BINARY
13565                             && $input_table == $property->table('No'))
13566                         || ($type == $FORCED_BINARY
13567                             && $property->table('No')->
13568                                         is_set_equivalent_to($input_table)))
13569                     {
13570                         $test_table = $property->table('Yes');
13571                         $p = 'P';
13572                     }
13573                     else {
13574                         $test_table = $input_table;
13575                         $p = 'p';
13576                     }
13577
13578                     # Look for a single form amongst all the children.
13579                     foreach my $table ($test_table->children) {
13580                         next if $table->property != $perl;
13581                         my $proposed_name = $table->short_name;
13582                         next if ! defined $proposed_name;
13583
13584                         # Don't mention internal-only properties as a possible
13585                         # single form synonym
13586                         next if substr($proposed_name, 0, 1) eq '_';
13587
13588                         $proposed_name = "\\$p\{$proposed_name}";
13589                         if (! defined $single_form
13590                             || length($proposed_name) < length $single_form)
13591                         {
13592                             $single_form = $proposed_name;
13593
13594                             # The goal here is to find a single form; not the
13595                             # shortest possible one.  We've already found a
13596                             # short name.  So, stop at the first single form
13597                             # found, which is likely to be closer to the
13598                             # original.
13599                             last;
13600                         }
13601                     }
13602                 }
13603
13604                 # Ouput both short and single in the same parenthesized
13605                 # expression, but with only one of 'Single', 'Short' if there
13606                 # are both items.
13607                 if ($short_name || $single_form || $table->conflicting) {
13608                     $parenthesized .= "Short: $short_name" if $short_name;
13609                     if ($short_name && $single_form) {
13610                         $parenthesized .= ', ';
13611                     }
13612                     elsif ($single_form) {
13613                         $parenthesized .= 'Single: ';
13614                     }
13615                     $parenthesized .= $single_form if $single_form;
13616                 }
13617             }
13618
13619             if ($caseless_equivalent != 0) {
13620                 $parenthesized .=  '; ' if $parenthesized ne "";
13621                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13622             }
13623
13624
13625             # Warn if this property isn't the same as one that a
13626             # semi-casual user might expect.  The other components of this
13627             # parenthesized structure are calculated only for the first entry
13628             # for this table, but the conflicting is deemed important enough
13629             # to go on every entry.
13630             my $conflicting = join " NOR ", $table->conflicting;
13631             if ($conflicting) {
13632                 $parenthesized .=  '; ' if $parenthesized ne "";
13633                 $parenthesized .= "NOT $conflicting";
13634             }
13635
13636             push @info, "($parenthesized)" if $parenthesized;
13637
13638             if ($name =~ /_$/ && $alias->loose_match) {
13639                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13640             }
13641
13642             if ($table_property != $perl && $table->perl_extension) {
13643                 push @info, '(Perl extension)';
13644             }
13645             push @info, "($string_count)";
13646
13647             # Now, we have both the entry and info so add them to the
13648             # list of all the properties.
13649             push @match_properties,
13650                 format_pod_line($indent_info_column,
13651                                 $entry,
13652                                 join( " ", @info),
13653                                 $alias->status,
13654                                 $alias->loose_match);
13655
13656             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13657         } # End of looping through the aliases for this table.
13658
13659         if (! $entry_for_first_table) {
13660             $entry_for_first_table = $entry_for_first_alias;
13661         }
13662     } # End of looping through all the related tables
13663     return;
13664 }
13665
13666 sub make_ucd_table_pod_entries {
13667     my $table = shift;
13668
13669     # Generate the entries for the UCD section of the pod for $table.  This
13670     # also calculates if names are ambiguous, so has to be called even if the
13671     # pod is not being output
13672
13673     my $short_name = $table->name;
13674     my $standard_short_name = standardize($short_name);
13675     my $full_name = $table->full_name;
13676     my $standard_full_name = standardize($full_name);
13677
13678     my $full_info = "";     # Text of info column for full-name entries
13679     my $other_info = "";    # Text of info column for short-name entries
13680     my $short_info = "";    # Text of info column for other entries
13681     my $meaning = "";       # Synonym of this table
13682
13683     my $property = ($table->isa('Property'))
13684                    ? $table
13685                    : $table->parent->property;
13686
13687     my $perl_extension = $table->perl_extension;
13688
13689     # Get the more official name for for perl extensions that aren't
13690     # stand-alone properties
13691     if ($perl_extension && $property != $table) {
13692         if ($property == $perl ||$property->type == $BINARY) {
13693             $meaning = $table->complete_name;
13694         }
13695         else {
13696             $meaning = $property->full_name . "=$full_name";
13697         }
13698     }
13699
13700     # There are three types of info column.  One for the short name, one for
13701     # the full name, and one for everything else.  They mostly are the same,
13702     # so initialize in the same loop.
13703     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
13704         if ($perl_extension && $property != $table) {
13705
13706             # Add the synonymous name for the non-full name entries; and to
13707             # the full-name entry if it adds extra information
13708             if ($info_ref == \$other_info
13709                 || ($info_ref == \$short_info
13710                     && $standard_short_name ne $standard_full_name)
13711                 || standardize($meaning) ne $standard_full_name
13712             ) {
13713                 $$info_ref .= "$meaning.";
13714             }
13715         }
13716         elsif ($info_ref != \$full_info) {
13717
13718             # Otherwise, the non-full name columns include the full name
13719             $$info_ref .= $full_name;
13720         }
13721
13722         # And the full-name entry includes the short name, if different
13723         if ($info_ref == \$full_info
13724             && $standard_short_name ne $standard_full_name)
13725         {
13726             $full_info =~ s/\.\Z//;
13727             $full_info .= "  " if $full_info;
13728             $full_info .= "(Short: $short_name)";
13729         }
13730
13731         if ($table->perl_extension) {
13732             $$info_ref =~ s/\.\Z//;
13733             $$info_ref .= ".  " if $$info_ref;
13734             $$info_ref .= "(Perl extension)";
13735         }
13736     }
13737
13738     # Add any extra annotations to the full name entry
13739     foreach my $more_info ($table->description,
13740                             $table->note,
13741                             $table->status_info)
13742     {
13743         next unless $more_info;
13744         $full_info =~ s/\.\Z//;
13745         $full_info .= ".  " if $full_info;
13746         $full_info .= $more_info;
13747     }
13748
13749     # These keep track if have created full and short name pod entries for the
13750     # property
13751     my $done_full = 0;
13752     my $done_short = 0;
13753
13754     # Every possible name is kept track of, even those that aren't going to be
13755     # output.  This way we can be sure to find the ambiguities.
13756     foreach my $alias ($table->aliases) {
13757         my $name = $alias->name;
13758         my $standard = standardize($name);
13759         my $info;
13760         my $output_this = $alias->ucd;
13761
13762         # If the full and short names are the same, we want to output the full
13763         # one's entry, so it has priority.
13764         if ($standard eq $standard_full_name) {
13765             next if $done_full;
13766             $done_full = 1;
13767             $info = $full_info;
13768         }
13769         elsif ($standard eq $standard_short_name) {
13770             next if $done_short;
13771             $done_short = 1;
13772             next if $standard_short_name eq $standard_full_name;
13773             $info = $short_info;
13774         }
13775         else {
13776             $info = $other_info;
13777         }
13778
13779         # Here, we have set up the two columns for this entry.  But if an
13780         # entry already exists for this name, we have to decide which one
13781         # we're going to later output.
13782         if (exists $ucd_pod{$standard}) {
13783
13784             # If the two entries refer to the same property, it's not going to
13785             # be ambiguous.  (Likely it's because the names when standardized
13786             # are the same.)  But that means if they are different properties,
13787             # there is ambiguity.
13788             if ($ucd_pod{$standard}->{'property'} != $property) {
13789
13790                 # Here, we have an ambiguity.  This code assumes that one is
13791                 # scheduled to be output and one not and that one is a perl
13792                 # extension (which is not to be output) and the other isn't.
13793                 # If those assumptions are wrong, things have to be rethought.
13794                 if ($ucd_pod{$standard}{'output_this'} == $output_this
13795                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
13796                     || $output_this == $perl_extension)
13797                 {
13798                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
13799                 }
13800
13801                 # We modifiy the info column of the one being output to
13802                 # indicate the ambiguity.  Set $which to point to that one's
13803                 # info.
13804                 my $which;
13805                 if ($ucd_pod{$standard}{'output_this'}) {
13806                     $which = \$ucd_pod{$standard}->{'info'};
13807                 }
13808                 else {
13809                     $which = \$info;
13810                     $meaning = $ucd_pod{$standard}{'meaning'};
13811                 }
13812
13813                 chomp $$which;
13814                 $$which =~ s/\.\Z//;
13815                 $$which .= "; NOT '$standard' meaning '$meaning'";
13816
13817                 $ambiguous_names{$standard} = 1;
13818             }
13819
13820             # Use the non-perl-extension variant
13821             next unless $ucd_pod{$standard}{'perl_extension'};
13822         }
13823
13824         # Store enough information about this entry that we can later look for
13825         # ambiguities, and output it properly.
13826         $ucd_pod{$standard} = { 'name' => $name,
13827                                 'info' => $info,
13828                                 'meaning' => $meaning,
13829                                 'output_this' => $output_this,
13830                                 'perl_extension' => $perl_extension,
13831                                 'property' => $property,
13832                                 'status' => $alias->status,
13833         };
13834     } # End of looping through all this table's aliases
13835
13836     return;
13837 }
13838
13839 sub pod_alphanumeric_sort {
13840     # Sort pod entries alphanumerically.
13841
13842     # The first few character columns are filler, plus the '\p{'; and get rid
13843     # of all the trailing stuff, starting with the trailing '}', so as to sort
13844     # on just 'Name=Value'
13845     (my $a = lc $a) =~ s/^ .*? { //x;
13846     $a =~ s/}.*//;
13847     (my $b = lc $b) =~ s/^ .*? { //x;
13848     $b =~ s/}.*//;
13849
13850     # Determine if the two operands are both internal only or both not.
13851     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13852     # should be the underscore that begins internal only
13853     my $a_is_internal = (substr($a, 0, 1) eq '_');
13854     my $b_is_internal = (substr($b, 0, 1) eq '_');
13855
13856     # Sort so the internals come last in the table instead of first (which the
13857     # leading underscore would otherwise indicate).
13858     if ($a_is_internal != $b_is_internal) {
13859         return 1 if $a_is_internal;
13860         return -1
13861     }
13862
13863     # Determine if the two operands are numeric property values or not.
13864     # A numeric property will look like xyz: 3.  But the number
13865     # can begin with an optional minus sign, and may have a
13866     # fraction or rational component, like xyz: 3/2.  If either
13867     # isn't numeric, use alphabetic sort.
13868     my ($a_initial, $a_number) =
13869         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13870     return $a cmp $b unless defined $a_number;
13871     my ($b_initial, $b_number) =
13872         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13873     return $a cmp $b unless defined $b_number;
13874
13875     # Here they are both numeric, but use alphabetic sort if the
13876     # initial parts don't match
13877     return $a cmp $b if $a_initial ne $b_initial;
13878
13879     # Convert rationals to floating for the comparison.
13880     $a_number = eval $a_number if $a_number =~ qr{/};
13881     $b_number = eval $b_number if $b_number =~ qr{/};
13882
13883     return $a_number <=> $b_number;
13884 }
13885
13886 sub make_pod () {
13887     # Create the .pod file.  This generates the various subsections and then
13888     # combines them in one big HERE document.
13889
13890     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13891
13892     return unless defined $pod_directory;
13893     print "Making pod file\n" if $verbosity >= $PROGRESS;
13894
13895     my $exception_message =
13896     '(Any exceptions are individually noted beginning with the word NOT.)';
13897     my @block_warning;
13898     if (-e 'Blocks.txt') {
13899
13900         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
13901         # if the global $has_In_conflicts indicates we have them.
13902         push @match_properties, format_pod_line($indent_info_column,
13903                                                 '\p{In_*}',
13904                                                 '\p{Block: *}'
13905                                                     . (($has_In_conflicts)
13906                                                       ? " $exception_message"
13907                                                       : ""));
13908         @block_warning = << "END";
13909
13910 Matches in the Block property have shortcuts that begin with "In_".  For
13911 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
13912 backward compatibility, if there is no conflict with another shortcut, these
13913 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
13914 are numerous such conflicting shortcuts.  Use of these forms for Block is
13915 discouraged, and are flagged as such, not only because of the potential
13916 confusion as to what is meant, but also because a later release of Unicode may
13917 preempt the shortcut, and your program would no longer be correct.  Use the
13918 "In_" form instead to avoid this, or even more clearly, use the compound form,
13919 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
13920 about this.
13921 END
13922     }
13923     my $text = $Is_flags_text;
13924     $text = "$exception_message $text" if $has_Is_conflicts;
13925
13926     # And the 'Is_ line';
13927     push @match_properties, format_pod_line($indent_info_column,
13928                                             '\p{Is_*}',
13929                                             "\\p{*} $text");
13930
13931     # Sort the properties array for output.  It is sorted alphabetically
13932     # except numerically for numeric properties, and only output unique lines.
13933     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13934
13935     my $formatted_properties = simple_fold(\@match_properties,
13936                                         "",
13937                                         # indent succeeding lines by two extra
13938                                         # which looks better
13939                                         $indent_info_column + 2,
13940
13941                                         # shorten the line length by how much
13942                                         # the formatter indents, so the folded
13943                                         # line will fit in the space
13944                                         # presumably available
13945                                         $automatic_pod_indent);
13946     # Add column headings, indented to be a little more centered, but not
13947     # exactly
13948     $formatted_properties =  format_pod_line($indent_info_column,
13949                                                     '    NAME',
13950                                                     '           INFO')
13951                                     . "\n"
13952                                     . $formatted_properties;
13953
13954     # Generate pod documentation lines for the tables that match nothing
13955     my $zero_matches = "";
13956     if (@zero_match_tables) {
13957         @zero_match_tables = uniques(@zero_match_tables);
13958         $zero_matches = join "\n\n",
13959                         map { $_ = '=item \p{' . $_->complete_name . "}" }
13960                             sort { $a->complete_name cmp $b->complete_name }
13961                             @zero_match_tables;
13962
13963         $zero_matches = <<END;
13964
13965 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
13966
13967 Unicode has some property-value pairs that currently don't match anything.
13968 This happens generally either because they are obsolete, or they exist for
13969 symmetry with other forms, but no language has yet been encoded that uses
13970 them.  In this version of Unicode, the following match zero code points:
13971
13972 =over 4
13973
13974 $zero_matches
13975
13976 =back
13977
13978 END
13979     }
13980
13981     # Generate list of properties that we don't accept, grouped by the reasons
13982     # why.  This is so only put out the 'why' once, and then list all the
13983     # properties that have that reason under it.
13984
13985     my %why_list;   # The keys are the reasons; the values are lists of
13986                     # properties that have the key as their reason
13987
13988     # For each property, add it to the list that are suppressed for its reason
13989     # The sort will cause the alphabetically first properties to be added to
13990     # each list first, so each list will be sorted.
13991     foreach my $property (sort keys %why_suppressed) {
13992         push @{$why_list{$why_suppressed{$property}}}, $property;
13993     }
13994
13995     # For each reason (sorted by the first property that has that reason)...
13996     my @bad_re_properties;
13997     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13998                      keys %why_list)
13999     {
14000         # Add to the output, all the properties that have that reason.
14001         my $has_item = 0;   # Flag if actually output anything.
14002         foreach my $name (@{$why_list{$why}}) {
14003
14004             # Split compound names into $property and $table components
14005             my $property = $name;
14006             my $table;
14007             if ($property =~ / (.*) = (.*) /x) {
14008                 $property = $1;
14009                 $table = $2;
14010             }
14011
14012             # This release of Unicode may not have a property that is
14013             # suppressed, so don't reference a non-existent one.
14014             $property = property_ref($property);
14015             next if ! defined $property;
14016
14017             # And since this list is only for match tables, don't list the
14018             # ones that don't have match tables.
14019             next if ! $property->to_create_match_tables;
14020
14021             # Find any abbreviation, and turn it into a compound name if this
14022             # is a property=value pair.
14023             my $short_name = $property->name;
14024             $short_name .= '=' . $property->table($table)->name if $table;
14025
14026             # Start with an empty line.
14027             push @bad_re_properties, "\n\n" unless $has_item;
14028
14029             # And add the property as an item for the reason.
14030             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
14031             $has_item = 1;
14032         }
14033
14034         # And add the reason under the list of properties, if such a list
14035         # actually got generated.  Note that the header got added
14036         # unconditionally before.  But pod ignores extra blank lines, so no
14037         # harm.
14038         push @bad_re_properties, "\n$why\n" if $has_item;
14039
14040     } # End of looping through each reason.
14041
14042     if (! @bad_re_properties) {
14043         push @bad_re_properties,
14044                 "*** This installation accepts ALL non-Unihan properties ***";
14045     }
14046     else {
14047         # Add =over only if non-empty to avoid an empty =over/=back section,
14048         # which is considered bad form.
14049         unshift @bad_re_properties, "\n=over 4\n";
14050         push @bad_re_properties, "\n=back\n";
14051     }
14052
14053     # Similiarly, generate a list of files that we don't use, grouped by the
14054     # reasons why.  First, create a hash whose keys are the reasons, and whose
14055     # values are anonymous arrays of all the files that share that reason.
14056     my %grouped_by_reason;
14057     foreach my $file (keys %ignored_files) {
14058         push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
14059     }
14060     foreach my $file (keys %skipped_files) {
14061         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
14062     }
14063
14064     # Then, sort each group.
14065     foreach my $group (keys %grouped_by_reason) {
14066         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
14067                                         @{$grouped_by_reason{$group}} ;
14068     }
14069
14070     # Finally, create the output text.  For each reason (sorted by the
14071     # alphabetically first file that has that reason)...
14072     my @unused_files;
14073     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
14074                                cmp lc $grouped_by_reason{$b}->[0]
14075                               }
14076                          keys %grouped_by_reason)
14077     {
14078         # Add all the files that have that reason to the output.  Start
14079         # with an empty line.
14080         push @unused_files, "\n\n";
14081         push @unused_files, map { "\n=item F<$_> \n" }
14082                             @{$grouped_by_reason{$reason}};
14083         # And add the reason under the list of files
14084         push @unused_files, "\n$reason\n";
14085     }
14086
14087     # Similarly, create the output text for the UCD section of the pod
14088     my @ucd_pod;
14089     foreach my $key (keys %ucd_pod) {
14090         next unless $ucd_pod{$key}->{'output_this'};
14091         push @ucd_pod, format_pod_line($indent_info_column,
14092                                        $ucd_pod{$key}->{'name'},
14093                                        $ucd_pod{$key}->{'info'},
14094                                        $ucd_pod{$key}->{'status'},
14095                                       );
14096     }
14097
14098     # Sort alphabetically, and fold for output
14099     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
14100     my $ucd_pod = simple_fold(\@ucd_pod,
14101                            ' ',
14102                            $indent_info_column,
14103                            $automatic_pod_indent);
14104     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
14105                 . "\n"
14106                 . $ucd_pod;
14107     local $" = "";
14108
14109     # Everything is ready to assemble.
14110     my @OUT = << "END";
14111 =begin comment
14112
14113 $HEADER
14114
14115 To change this file, edit $0 instead.
14116
14117 =end comment
14118
14119 =head1 NAME
14120
14121 $pod_file - Index of Unicode Version $string_version character properties in Perl
14122
14123 =head1 DESCRIPTION
14124
14125 This document provides information about the portion of the Unicode database
14126 that deals with character properties, that is the portion that is defined on
14127 single code points.  (L</Other information in the Unicode data base>
14128 below briefly mentions other data that Unicode provides.)
14129
14130 Perl can provide access to all non-provisional Unicode character properties,
14131 though not all are enabled by default.  The omitted ones are the Unihan
14132 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
14133 deprecated or Unicode-internal properties.  (An installation may choose to
14134 recompile Perl's tables to change this.  See L<Unicode character
14135 properties that are NOT accepted by Perl>.)
14136
14137 For most purposes, access to Unicode properties from the Perl core is through
14138 regular expression matches, as described in the next section.
14139 For some special purposes, and to access the properties that are not suitable
14140 for regular expression matching, all the Unicode character properties that
14141 Perl handles are accessible via the standard L<Unicode::UCD> module, as
14142 described in the section L</Properties accessible through Unicode::UCD>.
14143
14144 Perl also provides some additional extensions and short-cut synonyms
14145 for Unicode properties.
14146
14147 This document merely lists all available properties and does not attempt to
14148 explain what each property really means.  There is a brief description of each
14149 Perl extension; see L<perlunicode/Other Properties> for more information on
14150 these.  There is some detail about Blocks, Scripts, General_Category,
14151 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
14152 official Unicode properties, refer to the Unicode standard.  A good starting
14153 place is L<$unicode_reference_url>.
14154
14155 Note that you can define your own properties; see
14156 L<perlunicode/"User-Defined Character Properties">.
14157
14158 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
14159
14160 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
14161 most of the Unicode character properties.  The table below shows all these
14162 constructs, both single and compound forms.
14163
14164 B<Compound forms> consist of two components, separated by an equals sign or a
14165 colon.  The first component is the property name, and the second component is
14166 the particular value of the property to match against, for example,
14167 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
14168 whose Script property is Greek.
14169
14170 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
14171 their equivalent compound forms.  The table shows these equivalences.  (In our
14172 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
14173 There are also a few Perl-defined single forms that are not shortcuts for a
14174 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
14175
14176 In parsing these constructs, Perl always ignores Upper/lower case differences
14177 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
14178 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
14179 the left brace completely changes the meaning of the construct, from "match"
14180 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
14181 for improved legibility.
14182
14183 Also, white space, hyphens, and underscores are also normally ignored
14184 everywhere between the {braces}, and hence can be freely added or removed
14185 even if the C</x> modifier hasn't been specified on the regular expression.
14186 But $a_bold_stricter at the beginning of an entry in the table below
14187 means that tighter (stricter) rules are used for that entry:
14188
14189 =over 4
14190
14191 =item Single form (C<\\p{name}>) tighter rules:
14192
14193 White space, hyphens, and underscores ARE significant
14194 except for:
14195
14196 =over 4
14197
14198 =item * white space adjacent to a non-word character
14199
14200 =item * underscores separating digits in numbers
14201
14202 =back
14203
14204 That means, for example, that you can freely add or remove white space
14205 adjacent to (but within) the braces without affecting the meaning.
14206
14207 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
14208
14209 The tighter rules given above for the single form apply to everything to the
14210 right of the colon or equals; the looser rules still apply to everything to
14211 the left.
14212
14213 That means, for example, that you can freely add or remove white space
14214 adjacent to (but within) the braces and the colon or equal sign.
14215
14216 =back
14217
14218 Some properties are considered obsolete by Unicode, but still available.
14219 There are several varieties of obsolescence:
14220
14221 =over 4
14222
14223 =item Stabilized
14224
14225 A property may be stabilized.  Such a determination does not indicate
14226 that the property should or should not be used; instead it is a declaration
14227 that the property will not be maintained nor extended for newly encoded
14228 characters.  Such properties are marked with $a_bold_stabilized in the
14229 table.
14230
14231 =item Deprecated
14232
14233 A property may be deprecated, perhaps because its original intent
14234 has been replaced by another property, or because its specification was
14235 somehow defective.  This means that its use is strongly
14236 discouraged, so much so that a warning will be issued if used, unless the
14237 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
14238 statement.  $A_bold_deprecated flags each such entry in the table, and
14239 the entry there for the longest, most descriptive version of the property will
14240 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
14241 warning, even for properties that aren't officially deprecated by Unicode,
14242 when there used to be characters or code points that were matched by them, but
14243 no longer.  This is to warn you that your program may not work like it did on
14244 earlier Unicode releases.
14245
14246 A deprecated property may be made unavailable in a future Perl version, so it
14247 is best to move away from them.
14248
14249 A deprecated property may also be stabilized, but this fact is not shown.
14250
14251 =item Obsolete
14252
14253 Properties marked with $a_bold_obsolete in the table are considered (plain)
14254 obsolete.  Generally this designation is given to properties that Unicode once
14255 used for internal purposes (but not any longer).
14256
14257 =back
14258
14259 Some Perl extensions are present for backwards compatibility and are
14260 discouraged from being used, but are not obsolete.  $A_bold_discouraged
14261 flags each such entry in the table.  Future Unicode versions may force
14262 some of these extensions to be removed without warning, replaced by another
14263 property with the same name that means something different.  Use the
14264 equivalent shown instead.
14265
14266 @block_warning
14267
14268 The table below has two columns.  The left column contains the C<\\p{}>
14269 constructs to look up, possibly preceded by the flags mentioned above; and
14270 the right column contains information about them, like a description, or
14271 synonyms.  It shows both the single and compound forms for each property that
14272 has them.  If the left column is a short name for a property, the right column
14273 will give its longer, more descriptive name; and if the left column is the
14274 longest name, the right column will show any equivalent shortest name, in both
14275 single and compound forms if applicable.
14276
14277 The right column will also caution you if a property means something different
14278 than what might normally be expected.
14279
14280 All single forms are Perl extensions; a few compound forms are as well, and
14281 are noted as such.
14282
14283 Numbers in (parentheses) indicate the total number of code points matched by
14284 the property.  For emphasis, those properties that match no code points at all
14285 are listed as well in a separate section following the table.
14286
14287 Most properties match the same code points regardless of whether C<"/i">
14288 case-insensitive matching is specified or not.  But a few properties are
14289 affected.  These are shown with the notation
14290
14291  (/i= other_property)
14292
14293 in the second column.  Under case-insensitive matching they match the
14294 same code pode points as the property "other_property".
14295
14296 There is no description given for most non-Perl defined properties (See
14297 L<$unicode_reference_url> for that).
14298
14299 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
14300 combinations.  For example, entries like:
14301
14302  \\p{Gc: *}                                  \\p{General_Category: *}
14303
14304 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
14305 for the latter is also valid for the former.  Similarly,
14306
14307  \\p{Is_*}                                   \\p{*}
14308
14309 means that if and only if, for example, C<\\p{Foo}> exists, then
14310 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
14311 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
14312 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
14313 underscore.
14314
14315 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
14316 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
14317 'N*' to indicate this, and doesn't have separate entries for the other
14318 possibilities.  Note that not all properties which have values 'Yes' and 'No'
14319 are binary, and they have all their values spelled out without using this wild
14320 card, and a C<NOT> clause in their description that highlights their not being
14321 binary.  These also require the compound form to match them, whereas true
14322 binary properties have both single and compound forms available.
14323
14324 Note that all non-essential underscores are removed in the display of the
14325 short names below.
14326
14327 B<Legend summary:>
14328
14329 =over 4
14330
14331 =item Z<>B<*> is a wild-card
14332
14333 =item B<(\\d+)> in the info column gives the number of code points matched by
14334 this property.
14335
14336 =item B<$DEPRECATED> means this is deprecated.
14337
14338 =item B<$OBSOLETE> means this is obsolete.
14339
14340 =item B<$STABILIZED> means this is stabilized.
14341
14342 =item B<$STRICTER> means tighter (stricter) name matching applies.
14343
14344 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
14345 stable.
14346
14347 =back
14348
14349 $formatted_properties
14350
14351 $zero_matches
14352
14353 =head1 Properties accessible through Unicode::UCD
14354
14355 All the Unicode character properties mentioned above (except for those marked
14356 as for internal use by Perl) are also accessible by
14357 L<Unicode::UCD/prop_invlist()>.
14358
14359 Due to their nature, not all Unicode character properties are suitable for
14360 regular expression matches, nor C<prop_invlist()>.  The remaining
14361 non-provisional, non-internal ones are accessible via
14362 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
14363 hasn't included; see L<below for which those are|/Unicode character properties
14364 that are NOT accepted by Perl>).
14365
14366 For compatibility with other parts of Perl, all the single forms given in the
14367 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
14368 are recognized.  BUT, there are some ambiguities between some Perl extensions
14369 and the Unicode properties, all of which are silently resolved in favor of the
14370 official Unicode property.  To avoid surprises, you should only use
14371 C<prop_invmap()> for forms listed in the table below, which omits the
14372 non-recommended ones.  The affected forms are the Perl single form equivalents
14373 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
14374 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
14375 whose short name is C<sc>.  The table indicates the current ambiguities in the
14376 INFO column, beginning with the word C<"NOT">.
14377
14378 The standard Unicode properties listed below are documented in
14379 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
14380 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
14381 L<perlunicode/Other Properties>;
14382
14383 The first column in the table is a name for the property; the second column is
14384 an alternative name, if any, plus possibly some annotations.  The alternative
14385 name is the property's full name, unless that would simply repeat the first
14386 column, in which case the second column indicates the property's short name
14387 (if different).  The annotations are given only in the entry for the full
14388 name.  If a property is obsolete, etc, the entry will be flagged with the same
14389 characters used in the table in the L<section above|/Properties accessible
14390 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
14391
14392 $ucd_pod
14393
14394 =head1 Properties accessible through other means
14395
14396 Certain properties are accessible also via core function calls.  These are:
14397
14398  Lowercase_Mapping          lc() and lcfirst()
14399  Titlecase_Mapping          ucfirst()
14400  Uppercase_Mapping          uc()
14401
14402 Also, Case_Folding is accessible through the C</i> modifier in regular
14403 expressions.
14404
14405 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
14406 interpolation in double-quoted strings and regular expressions; and functions
14407 C<charnames::viacode()>, C<charnames::vianame()>, and
14408 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
14409 specified.
14410
14411 Finally, most properties related to decomposition are accessible via
14412 L<Unicode::Normalize>.
14413
14414 =head1 Unicode character properties that are NOT accepted by Perl
14415
14416 Perl will generate an error for a few character properties in Unicode when
14417 used in a regular expression.  The non-Unihan ones are listed below, with the
14418 reasons they are not accepted, perhaps with work-arounds.  The short names for
14419 the properties are listed enclosed in (parentheses).
14420 As described after the list, an installation can change the defaults and choose
14421 to accept any of these.  The list is machine generated based on the
14422 choices made for the installation that generated this document.
14423
14424 @bad_re_properties
14425
14426 An installation can choose to allow any of these to be matched by downloading
14427 the Unicode database from L<http://www.unicode.org/Public/> to
14428 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
14429 controlling lists contained in the program
14430 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
14431 (C<\%Config> is available from the Config module).
14432
14433 =head1 Other information in the Unicode data base
14434
14435 The Unicode data base is delivered in two different formats.  The XML version
14436 is valid for more modern Unicode releases.  The other version is a collection
14437 of files.  The two are intended to give equivalent information.  Perl uses the
14438 older form; this allows you to recompile Perl to use early Unicode releases.
14439
14440 The only non-character property that Perl currently supports is Named
14441 Sequences, in which a sequence of code points
14442 is given a name and generally treated as a single entity.  (Perl supports
14443 these via the C<\\N{...}> double-quotish construct,
14444 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
14445
14446 Below is a list of the files in the Unicode data base that Perl doesn't
14447 currently use, along with very brief descriptions of their purposes.
14448 Some of the names of the files have been shortened from those that Unicode
14449 uses, in order to allow them to be distinguishable from similarly named files
14450 on file systems for which only the first 8 characters of a name are
14451 significant.
14452
14453 =over 4
14454
14455 @unused_files
14456
14457 =back
14458
14459 =head1 SEE ALSO
14460
14461 L<$unicode_reference_url>
14462
14463 L<perlrecharclass>
14464
14465 L<perlunicode>
14466
14467 END
14468
14469     # And write it.  The 0 means no utf8.
14470     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
14471     return;
14472 }
14473
14474 sub make_Heavy () {
14475     # Create and write Heavy.pl, which passes info about the tables to
14476     # utf8_heavy.pl
14477
14478     # Stringify structures for output
14479     my $loose_property_name_of
14480                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
14481     chomp $loose_property_name_of;
14482
14483     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
14484     chomp $stricter_to_file_of;
14485
14486     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
14487     chomp $loose_to_file_of;
14488
14489     my $nv_floating_to_rational
14490                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
14491     chomp $nv_floating_to_rational;
14492
14493     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
14494     chomp $why_deprecated;
14495
14496     # We set the key to the file when we associated files with tables, but we
14497     # couldn't do the same for the value then, as we might not have the file
14498     # for the alternate table figured out at that time.
14499     foreach my $cased (keys %caseless_equivalent_to) {
14500         my @path = $caseless_equivalent_to{$cased}->file_path;
14501         my $path = join '/', @path[1, -1];
14502         $caseless_equivalent_to{$cased} = $path;
14503     }
14504     my $caseless_equivalent_to
14505                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
14506     chomp $caseless_equivalent_to;
14507
14508     my $loose_property_to_file_of
14509                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
14510     chomp $loose_property_to_file_of;
14511
14512     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
14513     chomp $file_to_swash_name;
14514
14515     my @heavy = <<END;
14516 $HEADER
14517 $INTERNAL_ONLY_HEADER
14518
14519 # This file is for the use of utf8_heavy.pl and Unicode::UCD
14520
14521 # Maps Unicode (not Perl single-form extensions) property names in loose
14522 # standard form to their corresponding standard names
14523 \%utf8::loose_property_name_of = (
14524 $loose_property_name_of
14525 );
14526
14527 # Maps property, table to file for those using stricter matching
14528 \%utf8::stricter_to_file_of = (
14529 $stricter_to_file_of
14530 );
14531
14532 # Maps property, table to file for those using loose matching
14533 \%utf8::loose_to_file_of = (
14534 $loose_to_file_of
14535 );
14536
14537 # Maps floating point to fractional form
14538 \%utf8::nv_floating_to_rational = (
14539 $nv_floating_to_rational
14540 );
14541
14542 # If a floating point number doesn't have enough digits in it to get this
14543 # close to a fraction, it isn't considered to be that fraction even if all the
14544 # digits it does have match.
14545 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
14546
14547 # Deprecated tables to generate a warning for.  The key is the file containing
14548 # the table, so as to avoid duplication, as many property names can map to the
14549 # file, but we only need one entry for all of them.
14550 \%utf8::why_deprecated = (
14551 $why_deprecated
14552 );
14553
14554 # A few properties have different behavior under /i matching.  This maps
14555 # those to substitute files to use under /i.
14556 \%utf8::caseless_equivalent = (
14557 $caseless_equivalent_to
14558 );
14559
14560 # Property names to mapping files
14561 \%utf8::loose_property_to_file_of = (
14562 $loose_property_to_file_of
14563 );
14564
14565 # Files to the swash names within them.
14566 \%utf8::file_to_swash_name = (
14567 $file_to_swash_name
14568 );
14569
14570 1;
14571 END
14572
14573     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
14574     return;
14575 }
14576
14577 sub make_Name_pm () {
14578     # Create and write Name.pm, which contains subroutines and data to use in
14579     # conjunction with Name.pl
14580
14581     # Maybe there's nothing to do.
14582     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
14583
14584     my @name = <<END;
14585 $HEADER
14586 $INTERNAL_ONLY_HEADER
14587 END
14588
14589     # Convert these structures to output format.
14590     my $code_points_ending_in_code_point =
14591         main::simple_dumper(\@code_points_ending_in_code_point,
14592                             ' ' x 8);
14593     my $names = main::simple_dumper(\%names_ending_in_code_point,
14594                                     ' ' x 8);
14595     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
14596                                     ' ' x 8);
14597
14598     # Do the same with the Hangul names,
14599     my $jamo;
14600     my $jamo_l;
14601     my $jamo_v;
14602     my $jamo_t;
14603     my $jamo_re;
14604     if ($has_hangul_syllables) {
14605
14606         # Construct a regular expression of all the possible
14607         # combinations of the Hangul syllables.
14608         my @L_re;   # Leading consonants
14609         for my $i ($LBase .. $LBase + $LCount - 1) {
14610             push @L_re, $Jamo{$i}
14611         }
14612         my @V_re;   # Middle vowels
14613         for my $i ($VBase .. $VBase + $VCount - 1) {
14614             push @V_re, $Jamo{$i}
14615         }
14616         my @T_re;   # Trailing consonants
14617         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
14618             push @T_re, $Jamo{$i}
14619         }
14620
14621         # The whole re is made up of the L V T combination.
14622         $jamo_re = '('
14623                     . join ('|', sort @L_re)
14624                     . ')('
14625                     . join ('|', sort @V_re)
14626                     . ')('
14627                     . join ('|', sort @T_re)
14628                     . ')?';
14629
14630         # These hashes needed by the algorithm were generated
14631         # during reading of the Jamo.txt file
14632         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
14633         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
14634         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
14635         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
14636     }
14637
14638     push @name, <<END;
14639
14640 package charnames;
14641
14642 # This module contains machine-generated tables and code for the
14643 # algorithmically-determinable Unicode character names.  The following
14644 # routines can be used to translate between name and code point and vice versa
14645
14646 { # Closure
14647
14648     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
14649     # two must be 10; if there are 5, the first must not be a 0.  Written this
14650     # way to decrease backtracking.  The first regex allows the code point to
14651     # be at the end of a word, but to work properly, the word shouldn't end
14652     # with a valid hex character.  The second one won't match a code point at
14653     # the end of a word, and doesn't have the run-on issue
14654     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
14655     my \$code_point_re = qr/$code_point_re/;
14656
14657     # In the following hash, the keys are the bases of names which includes
14658     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
14659     # of each key is another hash which is used to get the low and high ends
14660     # for each range of code points that apply to the name.
14661     my %names_ending_in_code_point = (
14662 $names
14663     );
14664
14665     # The following hash is a copy of the previous one, except is for loose
14666     # matching, so each name has blanks and dashes squeezed out
14667     my %loose_names_ending_in_code_point = (
14668 $loose_names
14669     );
14670
14671     # And the following array gives the inverse mapping from code points to
14672     # names.  Lowest code points are first
14673     my \@code_points_ending_in_code_point = (
14674 $code_points_ending_in_code_point
14675     );
14676 END
14677     # Earlier releases didn't have Jamos.  No sense outputting
14678     # them unless will be used.
14679     if ($has_hangul_syllables) {
14680         push @name, <<END;
14681
14682     # Convert from code point to Jamo short name for use in composing Hangul
14683     # syllable names
14684     my %Jamo = (
14685 $jamo
14686     );
14687
14688     # Leading consonant (can be null)
14689     my %Jamo_L = (
14690 $jamo_l
14691     );
14692
14693     # Vowel
14694     my %Jamo_V = (
14695 $jamo_v
14696     );
14697
14698     # Optional trailing consonant
14699     my %Jamo_T = (
14700 $jamo_t
14701     );
14702
14703     # Computed re that splits up a Hangul name into LVT or LV syllables
14704     my \$syllable_re = qr/$jamo_re/;
14705
14706     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
14707     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
14708
14709     # These constants names and values were taken from the Unicode standard,
14710     # version 5.1, section 3.12.  They are used in conjunction with Hangul
14711     # syllables
14712     my \$SBase = $SBase_string;
14713     my \$LBase = $LBase_string;
14714     my \$VBase = $VBase_string;
14715     my \$TBase = $TBase_string;
14716     my \$SCount = $SCount;
14717     my \$LCount = $LCount;
14718     my \$VCount = $VCount;
14719     my \$TCount = $TCount;
14720     my \$NCount = \$VCount * \$TCount;
14721 END
14722     } # End of has Jamos
14723
14724     push @name, << 'END';
14725
14726     sub name_to_code_point_special {
14727         my ($name, $loose) = @_;
14728
14729         # Returns undef if not one of the specially handled names; otherwise
14730         # returns the code point equivalent to the input name
14731         # $loose is non-zero if to use loose matching, 'name' in that case
14732         # must be input as upper case with all blanks and dashes squeezed out.
14733 END
14734     if ($has_hangul_syllables) {
14735         push @name, << 'END';
14736
14737         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
14738             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
14739         {
14740             return if $name !~ qr/^$syllable_re$/;
14741             my $L = $Jamo_L{$1};
14742             my $V = $Jamo_V{$2};
14743             my $T = (defined $3) ? $Jamo_T{$3} : 0;
14744             return ($L * $VCount + $V) * $TCount + $T + $SBase;
14745         }
14746 END
14747     }
14748     push @name, << 'END';
14749
14750         # Name must end in 'code_point' for this to handle.
14751         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
14752                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
14753
14754         my $base = $1;
14755         my $code_point = CORE::hex $2;
14756         my $names_ref;
14757
14758         if ($loose) {
14759             $names_ref = \%loose_names_ending_in_code_point;
14760         }
14761         else {
14762             return if $base !~ s/-$//;
14763             $names_ref = \%names_ending_in_code_point;
14764         }
14765
14766         # Name must be one of the ones which has the code point in it.
14767         return if ! $names_ref->{$base};
14768
14769         # Look through the list of ranges that apply to this name to see if
14770         # the code point is in one of them.
14771         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
14772             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
14773             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
14774
14775             # Here, the code point is in the range.
14776             return $code_point;
14777         }
14778
14779         # Here, looked like the name had a code point number in it, but
14780         # did not match one of the valid ones.
14781         return;
14782     }
14783
14784     sub code_point_to_name_special {
14785         my $code_point = shift;
14786
14787         # Returns the name of a code point if algorithmically determinable;
14788         # undef if not
14789 END
14790     if ($has_hangul_syllables) {
14791         push @name, << 'END';
14792
14793         # If in the Hangul range, calculate the name based on Unicode's
14794         # algorithm
14795         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
14796             use integer;
14797             my $SIndex = $code_point - $SBase;
14798             my $L = $LBase + $SIndex / $NCount;
14799             my $V = $VBase + ($SIndex % $NCount) / $TCount;
14800             my $T = $TBase + $SIndex % $TCount;
14801             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
14802             $name .= $Jamo{$T} if $T != $TBase;
14803             return $name;
14804         }
14805 END
14806     }
14807     push @name, << 'END';
14808
14809         # Look through list of these code points for one in range.
14810         foreach my $hash (@code_points_ending_in_code_point) {
14811             return if $code_point < $hash->{'low'};
14812             if ($code_point <= $hash->{'high'}) {
14813                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
14814             }
14815         }
14816         return;            # None found
14817     }
14818 } # End closure
14819
14820 1;
14821 END
14822
14823     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
14824     return;
14825 }
14826
14827 sub make_UCD () {
14828     # Create and write UCD.pl, which passes info about the tables to
14829     # Unicode::UCD
14830
14831     # Create a mapping from each alias of Perl single-form extensions to all
14832     # its equivalent aliases, for quick look-up.
14833     my %perlprop_to_aliases;
14834     foreach my $table ($perl->tables) {
14835
14836         # First create the list of the aliases of each extension
14837         my @aliases_list;    # List of legal aliases for this extension
14838
14839         my $table_name = $table->name;
14840         my $standard_table_name = standardize($table_name);
14841         my $table_full_name = $table->full_name;
14842         my $standard_table_full_name = standardize($table_full_name);
14843
14844         # Make sure that the list has both the short and full names
14845         push @aliases_list, $table_name, $table_full_name;
14846
14847         my $found_ucd = 0;  # ? Did we actually get an alias that should be
14848                             # output for this table
14849
14850         # Go through all the aliases (including the two just added), and add
14851         # any new unique ones to the list
14852         foreach my $alias ($table->aliases) {
14853
14854             # Skip non-legal names
14855             next unless $alias->ok_as_filename;
14856             next unless $alias->ucd;
14857
14858             $found_ucd = 1;     # have at least one legal name
14859
14860             my $name = $alias->name;
14861             my $standard = standardize($name);
14862
14863             # Don't repeat a name that is equivalent to one already on the
14864             # list
14865             next if $standard eq $standard_table_name;
14866             next if $standard eq $standard_table_full_name;
14867
14868             push @aliases_list, $name;
14869         }
14870
14871         # If there were no legal names, don't output anything.
14872         next unless $found_ucd;
14873
14874         # To conserve memory in the program reading these in, omit full names
14875         # that are identical to the short name, when those are the only two
14876         # aliases for the property.
14877         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
14878             pop @aliases_list;
14879         }
14880
14881         # Here, @aliases_list is the list of all the aliases that this
14882         # extension legally has.  Now can create a map to it from each legal
14883         # standardized alias
14884         foreach my $alias ($table->aliases) {
14885             next unless $alias->ucd;
14886             next unless $alias->ok_as_filename;
14887             push @{$perlprop_to_aliases{standardize($alias->name)}},
14888                  @aliases_list;
14889         }
14890     }
14891
14892     # Make a list of all combinations of properties/values that are suppressed.
14893     my @suppressed;
14894     foreach my $property_name (keys %why_suppressed) {
14895
14896         # Just the value
14897         my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
14898
14899         # The hash may contain properties not in this release of Unicode
14900         next unless defined (my $property = property_ref($property_name));
14901
14902         # Find all combinations
14903         foreach my $prop_alias ($property->aliases) {
14904             my $prop_alias_name = standardize($prop_alias->name);
14905
14906             # If no =value, there's just one combination possibe for this
14907             if (! $value_name) {
14908
14909                 # The property may be suppressed, but there may be a proxy for
14910                 # it, so it shouldn't be listed as suppressed
14911                 next if $prop_alias->ucd;
14912                 push @suppressed, $prop_alias_name;
14913             }
14914             else {  # Otherwise
14915                 foreach my $value_alias ($property->table($value_name)->aliases)
14916                 {
14917                     next if $value_alias->ucd;
14918
14919                     push @suppressed, "$prop_alias_name="
14920                                       .  standardize($value_alias->name);
14921                 }
14922             }
14923         }
14924     }
14925
14926     # Convert the structure below (designed for Name.pm) to a form that UCD
14927     # wants, so it doesn't have to modify it at all; i.e. so that it includes
14928     # an element for the Hangul syllables in the appropriate place, and
14929     # otherwise changes the name to include the "-<code point>" suffix.
14930     my @algorithm_names;
14931     my $done_hangul = 0;
14932
14933     # Copy it linearly.
14934     for my $i (0 .. @code_points_ending_in_code_point - 1) {
14935
14936         # Insert the hanguls in the correct place.
14937         if (! $done_hangul
14938             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
14939         {
14940             $done_hangul = 1;
14941             push @algorithm_names, { low => $SBase,
14942                                      high => $SBase + $SCount - 1,
14943                                      name => '<hangul syllable>',
14944                                     };
14945         }
14946
14947         # Copy the current entry, modified.
14948         push @algorithm_names, {
14949             low => $code_points_ending_in_code_point[$i]->{'low'},
14950             high => $code_points_ending_in_code_point[$i]->{'high'},
14951             name =>
14952                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
14953         };
14954     }
14955
14956     # Serialize these structures for output.
14957     my $loose_to_standard_value
14958                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
14959     chomp $loose_to_standard_value;
14960
14961     my $string_property_loose_to_name
14962                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
14963     chomp $string_property_loose_to_name;
14964
14965     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
14966     chomp $perlprop_to_aliases;
14967
14968     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
14969     chomp $prop_aliases;
14970
14971     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
14972     chomp $prop_value_aliases;
14973
14974     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
14975     chomp $suppressed;
14976
14977     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
14978     chomp $algorithm_names;
14979
14980     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
14981     chomp $ambiguous_names;
14982
14983     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
14984     chomp $loose_defaults;
14985
14986     my @ucd = <<END;
14987 $HEADER
14988 $INTERNAL_ONLY_HEADER
14989
14990 # This file is for the use of Unicode::UCD
14991
14992 # Highest legal Unicode code point
14993 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
14994
14995 # Hangul syllables
14996 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
14997 \$Unicode::UCD::HANGUL_COUNT = $SCount;
14998
14999 # Keys are all the possible "prop=value" combinations, in loose form; values
15000 # are the standard loose name for the 'value' part of the key
15001 \%Unicode::UCD::loose_to_standard_value = (
15002 $loose_to_standard_value
15003 );
15004
15005 # String property loose names to standard loose name
15006 \%Unicode::UCD::string_property_loose_to_name = (
15007 $string_property_loose_to_name
15008 );
15009
15010 # Keys are Perl extensions in loose form; values are each one's list of
15011 # aliases
15012 \%Unicode::UCD::loose_perlprop_to_name = (
15013 $perlprop_to_aliases
15014 );
15015
15016 # Keys are standard property name; values are each one's aliases
15017 \%Unicode::UCD::prop_aliases = (
15018 $prop_aliases
15019 );
15020
15021 # Keys of top level are standard property name; values are keys to another
15022 # hash,  Each one is one of the property's values, in standard form.  The
15023 # values are that prop-val's aliases.  If only one specified, the short and
15024 # long alias are identical.
15025 \%Unicode::UCD::prop_value_aliases = (
15026 $prop_value_aliases
15027 );
15028
15029 # Ordered (by code point ordinal) list of the ranges of code points whose
15030 # names are algorithmically determined.  Each range entry is an anonymous hash
15031 # of the start and end points and a template for the names within it.
15032 \@Unicode::UCD::algorithmic_named_code_points = (
15033 $algorithm_names
15034 );
15035
15036 # The properties that as-is have two meanings, and which must be disambiguated
15037 \%Unicode::UCD::ambiguous_names = (
15038 $ambiguous_names
15039 );
15040
15041 # Keys are the prop-val combinations which are the default values for the
15042 # given property, expressed in standard loose form
15043 \%Unicode::UCD::loose_defaults = (
15044 $loose_defaults
15045 );
15046
15047 # All combinations of names that are suppressed.
15048 # This is actually for UCD.t, so it knows which properties shouldn't have
15049 # entries.  If it got any bigger, would probably want to put it in its own
15050 # file to use memory only when it was needed, in testing.
15051 \@Unicode::UCD::suppressed_properties = (
15052 $suppressed
15053 );
15054
15055 1;
15056 END
15057
15058     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
15059     return;
15060 }
15061
15062 sub write_all_tables() {
15063     # Write out all the tables generated by this program to files, as well as
15064     # the supporting data structures, pod file, and .t file.
15065
15066     my @writables;              # List of tables that actually get written
15067     my %match_tables_to_write;  # Used to collapse identical match tables
15068                                 # into one file.  Each key is a hash function
15069                                 # result to partition tables into buckets.
15070                                 # Each value is an array of the tables that
15071                                 # fit in the bucket.
15072
15073     # For each property ...
15074     # (sort so that if there is an immutable file name, it has precedence, so
15075     # some other property can't come in and take over its file name.  If b's
15076     # file name is defined, will return 1, meaning to take it first; don't
15077     # care if both defined, as they had better be different anyway.  And the
15078     # property named 'Perl' needs to be first (it doesn't have any immutable
15079     # file name) because empty properties are defined in terms of it's table
15080     # named 'Any'.)
15081     PROPERTY:
15082     foreach my $property (sort { return -1 if $a == $perl;
15083                                  return 1 if $b == $perl;
15084                                  return defined $b->file
15085                                 } property_ref('*'))
15086     {
15087         my $type = $property->type;
15088
15089         # And for each table for that property, starting with the mapping
15090         # table for it ...
15091         TABLE:
15092         foreach my $table($property,
15093
15094                         # and all the match tables for it (if any), sorted so
15095                         # the ones with the shortest associated file name come
15096                         # first.  The length sorting prevents problems of a
15097                         # longer file taking a name that might have to be used
15098                         # by a shorter one.  The alphabetic sorting prevents
15099                         # differences between releases
15100                         sort {  my $ext_a = $a->external_name;
15101                                 return 1 if ! defined $ext_a;
15102                                 my $ext_b = $b->external_name;
15103                                 return -1 if ! defined $ext_b;
15104
15105                                 # But return the non-complement table before
15106                                 # the complement one, as the latter is defined
15107                                 # in terms of the former, and needs to have
15108                                 # the information for the former available.
15109                                 return 1 if $a->complement != 0;
15110                                 return -1 if $b->complement != 0;
15111
15112                                 # Similarly, return a subservient table after
15113                                 # a leader
15114                                 return 1 if $a->leader != $a;
15115                                 return -1 if $b->leader != $b;
15116
15117                                 my $cmp = length $ext_a <=> length $ext_b;
15118
15119                                 # Return result if lengths not equal
15120                                 return $cmp if $cmp;
15121
15122                                 # Alphabetic if lengths equal
15123                                 return $ext_a cmp $ext_b
15124                         } $property->tables
15125                     )
15126         {
15127
15128             # Here we have a table associated with a property.  It could be
15129             # the map table (done first for each property), or one of the
15130             # other tables.  Determine which type.
15131             my $is_property = $table->isa('Property');
15132
15133             my $name = $table->name;
15134             my $complete_name = $table->complete_name;
15135
15136             # See if should suppress the table if is empty, but warn if it
15137             # contains something.
15138             my $suppress_if_empty_warn_if_not
15139                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
15140
15141             # Calculate if this table should have any code points associated
15142             # with it or not.
15143             my $expected_empty =
15144
15145                 # $perl should be empty, as well as properties that we just
15146                 # don't do anything with
15147                 ($is_property
15148                     && ($table == $perl
15149                         || grep { $complete_name eq $_ }
15150                                                     @unimplemented_properties
15151                     )
15152                 )
15153
15154                 # Match tables in properties we skipped populating should be
15155                 # empty
15156                 || (! $is_property && ! $property->to_create_match_tables)
15157
15158                 # Tables and properties that are expected to have no code
15159                 # points should be empty
15160                 || $suppress_if_empty_warn_if_not
15161             ;
15162
15163             # Set a boolean if this table is the complement of an empty binary
15164             # table
15165             my $is_complement_of_empty_binary =
15166                 $type == $BINARY &&
15167                 (($table == $property->table('Y')
15168                     && $property->table('N')->is_empty)
15169                 || ($table == $property->table('N')
15170                     && $property->table('Y')->is_empty));
15171
15172             if ($table->is_empty) {
15173
15174                 if ($suppress_if_empty_warn_if_not) {
15175                     $table->set_fate($SUPPRESSED,
15176                                      $suppress_if_empty_warn_if_not);
15177                 }
15178
15179                 # Suppress (by skipping them) expected empty tables.
15180                 next TABLE if $expected_empty;
15181
15182                 # And setup to later output a warning for those that aren't
15183                 # known to be allowed to be empty.  Don't do the warning if
15184                 # this table is a child of another one to avoid duplicating
15185                 # the warning that should come from the parent one.
15186                 if (($table == $property || $table->parent == $table)
15187                     && $table->fate != $SUPPRESSED
15188                     && $table->fate != $MAP_PROXIED
15189                     && ! grep { $complete_name =~ /^$_$/ }
15190                                                     @tables_that_may_be_empty)
15191                 {
15192                     push @unhandled_properties, "$table";
15193                 }
15194
15195                 # An empty table is just the complement of everything.
15196                 $table->set_complement($Any) if $table != $property;
15197             }
15198             elsif ($expected_empty) {
15199                 my $because = "";
15200                 if ($suppress_if_empty_warn_if_not) {
15201                     $because = " because $suppress_if_empty_warn_if_not";
15202                 }
15203
15204                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
15205             }
15206
15207             # Some tables should match everything
15208             my $expected_full =
15209                 ($table->fate == $SUPPRESSED)
15210                 ? 0
15211                 : ($is_property)
15212                   ? # All these types of map tables will be full because
15213                     # they will have been populated with defaults
15214                     ($type == $ENUM || $type == $FORCED_BINARY)
15215
15216                   : # A match table should match everything if its method
15217                     # shows it should
15218                     ($table->matches_all
15219
15220                     # The complement of an empty binary table will match
15221                     # everything
15222                     || $is_complement_of_empty_binary
15223                     )
15224             ;
15225
15226             my $count = $table->count;
15227             if ($expected_full) {
15228                 if ($count != $MAX_UNICODE_CODEPOINTS) {
15229                     Carp::my_carp("$table matches only "
15230                     . clarify_number($count)
15231                     . " Unicode code points but should match "
15232                     . clarify_number($MAX_UNICODE_CODEPOINTS)
15233                     . " (off by "
15234                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
15235                     . ").  Proceeding anyway.");
15236                 }
15237
15238                 # Here is expected to be full.  If it is because it is the
15239                 # complement of an (empty) binary table that is to be
15240                 # suppressed, then suppress this one as well.
15241                 if ($is_complement_of_empty_binary) {
15242                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
15243                     my $opposing = $property->table($opposing_name);
15244                     my $opposing_status = $opposing->status;
15245                     if ($opposing_status) {
15246                         $table->set_status($opposing_status,
15247                                            $opposing->status_info);
15248                     }
15249                 }
15250             }
15251             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
15252                 if ($table == $property || $table->leader == $table) {
15253                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
15254                 }
15255             }
15256
15257             if ($table->fate == $SUPPRESSED) {
15258                 if (! $is_property) {
15259                     my @children = $table->children;
15260                     foreach my $child (@children) {
15261                         if ($child->fate != $SUPPRESSED) {
15262                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
15263                         }
15264                     }
15265                 }
15266                 next TABLE;
15267
15268             }
15269
15270             if (! $is_property) {
15271
15272                 make_ucd_table_pod_entries($table) if $table->property == $perl;
15273
15274                 # Several things need to be done just once for each related
15275                 # group of match tables.  Do them on the parent.
15276                 if ($table->parent == $table) {
15277
15278                     # Add an entry in the pod file for the table; it also does
15279                     # the children.
15280                     make_re_pod_entries($table) if defined $pod_directory;
15281
15282                     # See if the the table matches identical code points with
15283                     # something that has already been output.  In that case,
15284                     # no need to have two files with the same code points in
15285                     # them.  We use the table's hash() method to store these
15286                     # in buckets, so that it is quite likely that if two
15287                     # tables are in the same bucket they will be identical, so
15288                     # don't have to compare tables frequently.  The tables
15289                     # have to have the same status to share a file, so add
15290                     # this to the bucket hash.  (The reason for this latter is
15291                     # that Heavy.pl associates a status with a file.)
15292                     # We don't check tables that are inverses of others, as it
15293                     # would lead to some coding complications, and checking
15294                     # all the regular ones should find everything.
15295                     if ($table->complement == 0) {
15296                         my $hash = $table->hash . ';' . $table->status;
15297
15298                         # Look at each table that is in the same bucket as
15299                         # this one would be.
15300                         foreach my $comparison
15301                                             (@{$match_tables_to_write{$hash}})
15302                         {
15303                             if ($table->matches_identically_to($comparison)) {
15304                                 $table->set_equivalent_to($comparison,
15305                                                                 Related => 0);
15306                                 next TABLE;
15307                             }
15308                         }
15309
15310                         # Here, not equivalent, add this table to the bucket.
15311                         push @{$match_tables_to_write{$hash}}, $table;
15312                     }
15313                 }
15314             }
15315             else {
15316
15317                 # Here is the property itself.
15318                 # Don't write out or make references to the $perl property
15319                 next if $table == $perl;
15320
15321                 make_ucd_table_pod_entries($table);
15322
15323                 # There is a mapping stored of the various synonyms to the
15324                 # standardized name of the property for utf8_heavy.pl.
15325                 # Also, the pod file contains entries of the form:
15326                 # \p{alias: *}         \p{full: *}
15327                 # rather than show every possible combination of things.
15328
15329                 my @property_aliases = $property->aliases;
15330
15331                 my $full_property_name = $property->full_name;
15332                 my $property_name = $property->name;
15333                 my $standard_property_name = standardize($property_name);
15334                 my $standard_property_full_name
15335                                         = standardize($full_property_name);
15336
15337                 # We also create for Unicode::UCD a list of aliases for
15338                 # the property.  The list starts with the property name;
15339                 # then its full name.
15340                 my @property_list;
15341                 my @standard_list;
15342                 if ( $property->fate <= $MAP_PROXIED) {
15343                     @property_list = ($property_name, $full_property_name);
15344                     @standard_list = ($standard_property_name,
15345                                         $standard_property_full_name);
15346                 }
15347
15348                 # For each synonym ...
15349                 for my $i (0 .. @property_aliases - 1)  {
15350                     my $alias = $property_aliases[$i];
15351                     my $alias_name = $alias->name;
15352                     my $alias_standard = standardize($alias_name);
15353
15354
15355                     # Add other aliases to the list of property aliases
15356                     if ($property->fate <= $MAP_PROXIED
15357                         && ! grep { $alias_standard eq $_ } @standard_list)
15358                     {
15359                         push @property_list, $alias_name;
15360                         push @standard_list, $alias_standard;
15361                     }
15362
15363                     # For utf8_heavy, set the mapping of the alias to the
15364                     # property
15365                     if ($type == $STRING) {
15366                         if ($property->fate <= $MAP_PROXIED) {
15367                             $string_property_loose_to_name{$alias_standard}
15368                                             = $standard_property_name;
15369                         }
15370                     }
15371                     else {
15372                         if (exists ($loose_property_name_of{$alias_standard}))
15373                         {
15374                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
15375                         }
15376                         else {
15377                             $loose_property_name_of{$alias_standard}
15378                                                 = $standard_property_name;
15379                         }
15380
15381                         # Now for the re pod entry for this alias.  Skip if not
15382                         # outputting a pod; skip the first one, which is the
15383                         # full name so won't have an entry like: '\p{full: *}
15384                         # \p{full: *}', and skip if don't want an entry for
15385                         # this one.
15386                         next if $i == 0
15387                                 || ! defined $pod_directory
15388                                 || ! $alias->make_re_pod_entry;
15389
15390                         my $rhs = "\\p{$full_property_name: *}";
15391                         if ($property != $perl && $table->perl_extension) {
15392                             $rhs .= ' (Perl extension)';
15393                         }
15394                         push @match_properties,
15395                             format_pod_line($indent_info_column,
15396                                         '\p{' . $alias->name . ': *}',
15397                                         $rhs,
15398                                         $alias->status);
15399                     }
15400                 }
15401
15402                 # The list of all possible names is attached to each alias, so
15403                 # lookup is easy
15404                 if (@property_list) {
15405                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
15406                 }
15407
15408                 if ($property->fate <= $MAP_PROXIED) {
15409
15410                     # Similarly, we create for Unicode::UCD a list of
15411                     # property-value aliases.
15412
15413                     my $property_full_name = $property->full_name;
15414
15415                     # Look at each table in the property...
15416                     foreach my $table ($property->tables) {
15417                         my @values_list;
15418                         my $table_full_name = $table->full_name;
15419                         my $standard_table_full_name
15420                                               = standardize($table_full_name);
15421                         my $table_name = $table->name;
15422                         my $standard_table_name = standardize($table_name);
15423
15424                         # The list starts with the table name and its full
15425                         # name.
15426                         push @values_list, $table_name, $table_full_name;
15427
15428                         # We add to the table each unique alias that isn't
15429                         # discouraged from use.
15430                         foreach my $alias ($table->aliases) {
15431                             next if $alias->status
15432                                  && $alias->status eq $DISCOURAGED;
15433                             my $name = $alias->name;
15434                             my $standard = standardize($name);
15435                             next if $standard eq $standard_table_name;
15436                             next if $standard eq $standard_table_full_name;
15437                             push @values_list, $name;
15438                         }
15439
15440                         # Here @values_list is a list of all the aliases for
15441                         # the table.  That is, all the property-values given
15442                         # by this table.  By agreement with Unicode::UCD,
15443                         # if the name and full name are identical, and there
15444                         # are no other names, drop the duplcate entry to save
15445                         # memory.
15446                         if (@values_list == 2
15447                             && $values_list[0] eq $values_list[1])
15448                         {
15449                             pop @values_list
15450                         }
15451
15452                         # To save memory, unlike the similar list for property
15453                         # aliases above, only the standard forms hve the list.
15454                         # This forces an extra step of converting from input
15455                         # name to standard name, but the savings are
15456                         # considerable.  (There is only marginal savings if we
15457                         # did this with the property aliases.)
15458                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
15459                     }
15460                 }
15461
15462                 # Don't write out a mapping file if not desired.
15463                 next if ! $property->to_output_map;
15464             }
15465
15466             # Here, we know we want to write out the table, but don't do it
15467             # yet because there may be other tables that come along and will
15468             # want to share the file, and the file's comments will change to
15469             # mention them.  So save for later.
15470             push @writables, $table;
15471
15472         } # End of looping through the property and all its tables.
15473     } # End of looping through all properties.
15474
15475     # Now have all the tables that will have files written for them.  Do it.
15476     foreach my $table (@writables) {
15477         my @directory;
15478         my $filename;
15479         my $property = $table->property;
15480         my $is_property = ($table == $property);
15481         if (! $is_property) {
15482
15483             # Match tables for the property go in lib/$subdirectory, which is
15484             # the property's name.  Don't use the standard file name for this,
15485             # as may get an unfamiliar alias
15486             @directory = ($matches_directory, $property->external_name);
15487         }
15488         else {
15489
15490             @directory = $table->directory;
15491             $filename = $table->file;
15492         }
15493
15494         # Use specified filename if available, or default to property's
15495         # shortest name.  We need an 8.3 safe filename (which means "an 8
15496         # safe" filename, since after the dot is only 'pl', which is < 3)
15497         # The 2nd parameter is if the filename shouldn't be changed, and
15498         # it shouldn't iff there is a hard-coded name for this table.
15499         $filename = construct_filename(
15500                                 $filename || $table->external_name,
15501                                 ! $filename,    # mutable if no filename
15502                                 \@directory);
15503
15504         register_file_for_name($table, \@directory, $filename);
15505
15506         # Only need to write one file when shared by more than one
15507         # property
15508         next if ! $is_property
15509                 && ($table->leader != $table || $table->complement != 0);
15510
15511         # Construct a nice comment to add to the file
15512         $table->set_final_comment;
15513
15514         $table->write;
15515     }
15516
15517
15518     # Write out the pod file
15519     make_pod;
15520
15521     # And Heavy.pl, Name.pm, UCD.pl
15522     make_Heavy;
15523     make_Name_pm;
15524     make_UCD;
15525
15526     make_property_test_script() if $make_test_script;
15527     return;
15528 }
15529
15530 my @white_space_separators = ( # This used only for making the test script.
15531                             "",
15532                             ' ',
15533                             "\t",
15534                             '   '
15535                         );
15536
15537 sub generate_separator($) {
15538     # This used only for making the test script.  It generates the colon or
15539     # equal separator between the property and property value, with random
15540     # white space surrounding the separator
15541
15542     my $lhs = shift;
15543
15544     return "" if $lhs eq "";  # No separator if there's only one (the r) side
15545
15546     # Choose space before and after randomly
15547     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
15548     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
15549
15550     # And return the whole complex, half the time using a colon, half the
15551     # equals
15552     return $spaces_before
15553             . (rand() < 0.5) ? '=' : ':'
15554             . $spaces_after;
15555 }
15556
15557 sub generate_tests($$$$$) {
15558     # This used only for making the test script.  It generates test cases that
15559     # are expected to compile successfully in perl.  Note that the lhs and
15560     # rhs are assumed to already be as randomized as the caller wants.
15561
15562     my $lhs = shift;           # The property: what's to the left of the colon
15563                                #  or equals separator
15564     my $rhs = shift;           # The property value; what's to the right
15565     my $valid_code = shift;    # A code point that's known to be in the
15566                                # table given by lhs=rhs; undef if table is
15567                                # empty
15568     my $invalid_code = shift;  # A code point known to not be in the table;
15569                                # undef if the table is all code points
15570     my $warning = shift;
15571
15572     # Get the colon or equal
15573     my $separator = generate_separator($lhs);
15574
15575     # The whole 'property=value'
15576     my $name = "$lhs$separator$rhs";
15577
15578     my @output;
15579     # Create a complete set of tests, with complements.
15580     if (defined $valid_code) {
15581         push @output, <<"EOC"
15582 Expect(1, $valid_code, '\\p{$name}', $warning);
15583 Expect(0, $valid_code, '\\p{^$name}', $warning);
15584 Expect(0, $valid_code, '\\P{$name}', $warning);
15585 Expect(1, $valid_code, '\\P{^$name}', $warning);
15586 EOC
15587     }
15588     if (defined $invalid_code) {
15589         push @output, <<"EOC"
15590 Expect(0, $invalid_code, '\\p{$name}', $warning);
15591 Expect(1, $invalid_code, '\\p{^$name}', $warning);
15592 Expect(1, $invalid_code, '\\P{$name}', $warning);
15593 Expect(0, $invalid_code, '\\P{^$name}', $warning);
15594 EOC
15595     }
15596     return @output;
15597 }
15598
15599 sub generate_error($$$) {
15600     # This used only for making the test script.  It generates test cases that
15601     # are expected to not only not match, but to be syntax or similar errors
15602
15603     my $lhs = shift;                # The property: what's to the left of the
15604                                     # colon or equals separator
15605     my $rhs = shift;                # The property value; what's to the right
15606     my $already_in_error = shift;   # Boolean; if true it's known that the
15607                                 # unmodified lhs and rhs will cause an error.
15608                                 # This routine should not force another one
15609     # Get the colon or equal
15610     my $separator = generate_separator($lhs);
15611
15612     # Since this is an error only, don't bother to randomly decide whether to
15613     # put the error on the left or right side; and assume that the rhs is
15614     # loosely matched, again for convenience rather than rigor.
15615     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
15616
15617     my $property = $lhs . $separator . $rhs;
15618
15619     return <<"EOC";
15620 Error('\\p{$property}');
15621 Error('\\P{$property}');
15622 EOC
15623 }
15624
15625 # These are used only for making the test script
15626 # XXX Maybe should also have a bad strict seps, which includes underscore.
15627
15628 my @good_loose_seps = (
15629             " ",
15630             "-",
15631             "\t",
15632             "",
15633             "_",
15634            );
15635 my @bad_loose_seps = (
15636            "/a/",
15637            ':=',
15638           );
15639
15640 sub randomize_stricter_name {
15641     # This used only for making the test script.  Take the input name and
15642     # return a randomized, but valid version of it under the stricter matching
15643     # rules.
15644
15645     my $name = shift;
15646     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15647
15648     # If the name looks like a number (integer, floating, or rational), do
15649     # some extra work
15650     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
15651         my $sign = $1;
15652         my $number = $2;
15653         my $separator = $3;
15654
15655         # If there isn't a sign, part of the time add a plus
15656         # Note: Not testing having any denominator having a minus sign
15657         if (! $sign) {
15658             $sign = '+' if rand() <= .3;
15659         }
15660
15661         # And add 0 or more leading zeros.
15662         $name = $sign . ('0' x int rand(10)) . $number;
15663
15664         if (defined $separator) {
15665             my $extra_zeros = '0' x int rand(10);
15666
15667             if ($separator eq '.') {
15668
15669                 # Similarly, add 0 or more trailing zeros after a decimal
15670                 # point
15671                 $name .= $extra_zeros;
15672             }
15673             else {
15674
15675                 # Or, leading zeros before the denominator
15676                 $name =~ s,/,/$extra_zeros,;
15677             }
15678         }
15679     }
15680
15681     # For legibility of the test, only change the case of whole sections at a
15682     # time.  To do this, first split into sections.  The split returns the
15683     # delimiters
15684     my @sections;
15685     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
15686         trace $section if main::DEBUG && $to_trace;
15687
15688         if (length $section > 1 && $section !~ /\D/) {
15689
15690             # If the section is a sequence of digits, about half the time
15691             # randomly add underscores between some of them.
15692             if (rand() > .5) {
15693
15694                 # Figure out how many underscores to add.  max is 1 less than
15695                 # the number of digits.  (But add 1 at the end to make sure
15696                 # result isn't 0, and compensate earlier by subtracting 2
15697                 # instead of 1)
15698                 my $num_underscores = int rand(length($section) - 2) + 1;
15699
15700                 # And add them evenly throughout, for convenience, not rigor
15701                 use integer;
15702                 my $spacing = (length($section) - 1)/ $num_underscores;
15703                 my $temp = $section;
15704                 $section = "";
15705                 for my $i (1 .. $num_underscores) {
15706                     $section .= substr($temp, 0, $spacing, "") . '_';
15707                 }
15708                 $section .= $temp;
15709             }
15710             push @sections, $section;
15711         }
15712         else {
15713
15714             # Here not a sequence of digits.  Change the case of the section
15715             # randomly
15716             my $switch = int rand(4);
15717             if ($switch == 0) {
15718                 push @sections, uc $section;
15719             }
15720             elsif ($switch == 1) {
15721                 push @sections, lc $section;
15722             }
15723             elsif ($switch == 2) {
15724                 push @sections, ucfirst $section;
15725             }
15726             else {
15727                 push @sections, $section;
15728             }
15729         }
15730     }
15731     trace "returning", join "", @sections if main::DEBUG && $to_trace;
15732     return join "", @sections;
15733 }
15734
15735 sub randomize_loose_name($;$) {
15736     # This used only for making the test script
15737
15738     my $name = shift;
15739     my $want_error = shift;  # if true, make an error
15740     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15741
15742     $name = randomize_stricter_name($name);
15743
15744     my @parts;
15745     push @parts, $good_loose_seps[rand(@good_loose_seps)];
15746
15747     # Preserve trailing ones for the sake of not stripping the underscore from
15748     # 'L_'
15749     for my $part (split /[-\s_]+ (?= . )/, $name) {
15750         if (@parts) {
15751             if ($want_error and rand() < 0.3) {
15752                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
15753                 $want_error = 0;
15754             }
15755             else {
15756                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
15757             }
15758         }
15759         push @parts, $part;
15760     }
15761     my $new = join("", @parts);
15762     trace "$name => $new" if main::DEBUG && $to_trace;
15763
15764     if ($want_error) {
15765         if (rand() >= 0.5) {
15766             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
15767         }
15768         else {
15769             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
15770         }
15771     }
15772     return $new;
15773 }
15774
15775 # Used to make sure don't generate duplicate test cases.
15776 my %test_generated;
15777
15778 sub make_property_test_script() {
15779     # This used only for making the test script
15780     # this written directly -- it's huge.
15781
15782     print "Making test script\n" if $verbosity >= $PROGRESS;
15783
15784     # This uses randomness to test different possibilities without testing all
15785     # possibilities.  To ensure repeatability, set the seed to 0.  But if
15786     # tests are added, it will perturb all later ones in the .t file
15787     srand 0;
15788
15789     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
15790
15791     # Keep going down an order of magnitude
15792     # until find that adding this quantity to
15793     # 1 remains 1; but put an upper limit on
15794     # this so in case this algorithm doesn't
15795     # work properly on some platform, that we
15796     # won't loop forever.
15797     my $digits = 0;
15798     my $min_floating_slop = 1;
15799     while (1+ $min_floating_slop != 1
15800             && $digits++ < 50)
15801     {
15802         my $next = $min_floating_slop / 10;
15803         last if $next == 0; # If underflows,
15804                             # use previous one
15805         $min_floating_slop = $next;
15806     }
15807
15808     # It doesn't matter whether the elements of this array contain single lines
15809     # or multiple lines. main::write doesn't count the lines.
15810     my @output;
15811
15812     foreach my $property (property_ref('*')) {
15813         foreach my $table ($property->tables) {
15814
15815             # Find code points that match, and don't match this table.
15816             my $valid = $table->get_valid_code_point;
15817             my $invalid = $table->get_invalid_code_point;
15818             my $warning = ($table->status eq $DEPRECATED)
15819                             ? "'deprecated'"
15820                             : '""';
15821
15822             # Test each possible combination of the property's aliases with
15823             # the table's.  If this gets to be too many, could do what is done
15824             # in the set_final_comment() for Tables
15825             my @table_aliases = $table->aliases;
15826             my @property_aliases = $table->property->aliases;
15827
15828             # Every property can be optionally be prefixed by 'Is_', so test
15829             # that those work, by creating such a new alias for each
15830             # pre-existing one.
15831             push @property_aliases, map { Alias->new("Is_" . $_->name,
15832                                                     $_->loose_match,
15833                                                     $_->make_re_pod_entry,
15834                                                     $_->ok_as_filename,
15835                                                     $_->status,
15836                                                     $_->ucd,
15837                                                     )
15838                                          } @property_aliases;
15839             my $max = max(scalar @table_aliases, scalar @property_aliases);
15840             for my $j (0 .. $max - 1) {
15841
15842                 # The current alias for property is the next one on the list,
15843                 # or if beyond the end, start over.  Similarly for table
15844                 my $property_name
15845                             = $property_aliases[$j % @property_aliases]->name;
15846
15847                 $property_name = "" if $table->property == $perl;
15848                 my $table_alias = $table_aliases[$j % @table_aliases];
15849                 my $table_name = $table_alias->name;
15850                 my $loose_match = $table_alias->loose_match;
15851
15852                 # If the table doesn't have a file, any test for it is
15853                 # already guaranteed to be in error
15854                 my $already_error = ! $table->file_path;
15855
15856                 # Generate error cases for this alias.
15857                 push @output, generate_error($property_name,
15858                                              $table_name,
15859                                              $already_error);
15860
15861                 # If the table is guaranteed to always generate an error,
15862                 # quit now without generating success cases.
15863                 next if $already_error;
15864
15865                 # Now for the success cases.
15866                 my $random;
15867                 if ($loose_match) {
15868
15869                     # For loose matching, create an extra test case for the
15870                     # standard name.
15871                     my $standard = standardize($table_name);
15872
15873                     # $test_name should be a unique combination for each test
15874                     # case; used just to avoid duplicate tests
15875                     my $test_name = "$property_name=$standard";
15876
15877                     # Don't output duplicate test cases.
15878                     if (! exists $test_generated{$test_name}) {
15879                         $test_generated{$test_name} = 1;
15880                         push @output, generate_tests($property_name,
15881                                                      $standard,
15882                                                      $valid,
15883                                                      $invalid,
15884                                                      $warning,
15885                                                  );
15886                     }
15887                     $random = randomize_loose_name($table_name)
15888                 }
15889                 else { # Stricter match
15890                     $random = randomize_stricter_name($table_name);
15891                 }
15892
15893                 # Now for the main test case for this alias.
15894                 my $test_name = "$property_name=$random";
15895                 if (! exists $test_generated{$test_name}) {
15896                     $test_generated{$test_name} = 1;
15897                     push @output, generate_tests($property_name,
15898                                                  $random,
15899                                                  $valid,
15900                                                  $invalid,
15901                                                  $warning,
15902                                              );
15903
15904                     # If the name is a rational number, add tests for the
15905                     # floating point equivalent.
15906                     if ($table_name =~ qr{/}) {
15907
15908                         # Calculate the float, and find just the fraction.
15909                         my $float = eval $table_name;
15910                         my ($whole, $fraction)
15911                                             = $float =~ / (.*) \. (.*) /x;
15912
15913                         # Starting with one digit after the decimal point,
15914                         # create a test for each possible precision (number of
15915                         # digits past the decimal point) until well beyond the
15916                         # native number found on this machine.  (If we started
15917                         # with 0 digits, it would be an integer, which could
15918                         # well match an unrelated table)
15919                         PLACE:
15920                         for my $i (1 .. $min_floating_slop + 3) {
15921                             my $table_name = sprintf("%.*f", $i, $float);
15922                             if ($i < $MIN_FRACTION_LENGTH) {
15923
15924                                 # If the test case has fewer digits than the
15925                                 # minimum acceptable precision, it shouldn't
15926                                 # succeed, so we expect an error for it.
15927                                 # E.g., 2/3 = .7 at one decimal point, and we
15928                                 # shouldn't say it matches .7.  We should make
15929                                 # it be .667 at least before agreeing that the
15930                                 # intent was to match 2/3.  But at the
15931                                 # less-than- acceptable level of precision, it
15932                                 # might actually match an unrelated number.
15933                                 # So don't generate a test case if this
15934                                 # conflating is possible.  In our example, we
15935                                 # don't want 2/3 matching 7/10, if there is
15936                                 # a 7/10 code point.
15937                                 for my $existing
15938                                         (keys %nv_floating_to_rational)
15939                                 {
15940                                     next PLACE
15941                                         if abs($table_name - $existing)
15942                                                 < $MAX_FLOATING_SLOP;
15943                                 }
15944                                 push @output, generate_error($property_name,
15945                                                              $table_name,
15946                                                              1   # 1 => already an error
15947                                               );
15948                             }
15949                             else {
15950
15951                                 # Here the number of digits exceeds the
15952                                 # minimum we think is needed.  So generate a
15953                                 # success test case for it.
15954                                 push @output, generate_tests($property_name,
15955                                                              $table_name,
15956                                                              $valid,
15957                                                              $invalid,
15958                                                              $warning,
15959                                              );
15960                             }
15961                         }
15962                     }
15963                 }
15964             }
15965         }
15966     }
15967
15968     &write($t_path,
15969            0,           # Not utf8;
15970            [<DATA>,
15971             @output,
15972             (map {"Test_X('$_');\n"} @backslash_X_tests),
15973             "Finished();\n"]);
15974     return;
15975 }
15976
15977 # This is a list of the input files and how to handle them.  The files are
15978 # processed in their order in this list.  Some reordering is possible if
15979 # desired, but the v0 files should be first, and the extracted before the
15980 # others except DAge.txt (as data in an extracted file can be over-ridden by
15981 # the non-extracted.  Some other files depend on data derived from an earlier
15982 # file, like UnicodeData requires data from Jamo, and the case changing and
15983 # folding requires data from Unicode.  Mostly, it is safest to order by first
15984 # version releases in (except the Jamo).  DAge.txt is read before the
15985 # extracted ones because of the rarely used feature $compare_versions.  In the
15986 # unlikely event that there were ever an extracted file that contained the Age
15987 # property information, it would have to go in front of DAge.
15988 #
15989 # The version strings allow the program to know whether to expect a file or
15990 # not, but if a file exists in the directory, it will be processed, even if it
15991 # is in a version earlier than expected, so you can copy files from a later
15992 # release into an earlier release's directory.
15993 my @input_file_objects = (
15994     Input_file->new('PropertyAliases.txt', v0,
15995                     Handler => \&process_PropertyAliases,
15996                     ),
15997     Input_file->new(undef, v0,  # No file associated with this
15998                     Progress_Message => 'Finishing property setup',
15999                     Handler => \&finish_property_setup,
16000                     ),
16001     Input_file->new('PropValueAliases.txt', v0,
16002                      Handler => \&process_PropValueAliases,
16003                      Has_Missings_Defaults => $NOT_IGNORED,
16004                      ),
16005     Input_file->new('DAge.txt', v3.2.0,
16006                     Has_Missings_Defaults => $NOT_IGNORED,
16007                     Property => 'Age'
16008                     ),
16009     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
16010                     Property => 'General_Category',
16011                     ),
16012     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
16013                     Property => 'Canonical_Combining_Class',
16014                     Has_Missings_Defaults => $NOT_IGNORED,
16015                     ),
16016     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
16017                     Property => 'Numeric_Type',
16018                     Has_Missings_Defaults => $NOT_IGNORED,
16019                     ),
16020     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
16021                     Property => 'East_Asian_Width',
16022                     Has_Missings_Defaults => $NOT_IGNORED,
16023                     ),
16024     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
16025                     Property => 'Line_Break',
16026                     Has_Missings_Defaults => $NOT_IGNORED,
16027                     ),
16028     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
16029                     Property => 'Bidi_Class',
16030                     Has_Missings_Defaults => $NOT_IGNORED,
16031                     ),
16032     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
16033                     Property => 'Decomposition_Type',
16034                     Has_Missings_Defaults => $NOT_IGNORED,
16035                     ),
16036     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
16037     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
16038                     Property => 'Numeric_Value',
16039                     Each_Line_Handler => \&filter_numeric_value_line,
16040                     Has_Missings_Defaults => $NOT_IGNORED,
16041                     ),
16042     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
16043                     Property => 'Joining_Group',
16044                     Has_Missings_Defaults => $NOT_IGNORED,
16045                     ),
16046
16047     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
16048                     Property => 'Joining_Type',
16049                     Has_Missings_Defaults => $NOT_IGNORED,
16050                     ),
16051     Input_file->new('Jamo.txt', v2.0.0,
16052                     Property => 'Jamo_Short_Name',
16053                     Each_Line_Handler => \&filter_jamo_line,
16054                     ),
16055     Input_file->new('UnicodeData.txt', v1.1.5,
16056                     Pre_Handler => \&setup_UnicodeData,
16057
16058                     # We clean up this file for some early versions.
16059                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
16060                                             ? \&filter_v1_ucd
16061                                             : ($v_version eq v2.1.5)
16062                                                 ? \&filter_v2_1_5_ucd
16063
16064                                                 # And for 5.14 Perls with 6.0,
16065                                                 # have to also make changes
16066                                                 : ($v_version ge v6.0.0)
16067                                                     ? \&filter_v6_ucd
16068                                                     : undef),
16069
16070                                             # And the main filter
16071                                             \&filter_UnicodeData_line,
16072                                          ],
16073                     EOF_Handler => \&EOF_UnicodeData,
16074                     ),
16075     Input_file->new('ArabicShaping.txt', v2.0.0,
16076                     Each_Line_Handler =>
16077                         [ ($v_version lt 4.1.0)
16078                                     ? \&filter_old_style_arabic_shaping
16079                                     : undef,
16080                         \&filter_arabic_shaping_line,
16081                         ],
16082                     Has_Missings_Defaults => $NOT_IGNORED,
16083                     ),
16084     Input_file->new('Blocks.txt', v2.0.0,
16085                     Property => 'Block',
16086                     Has_Missings_Defaults => $NOT_IGNORED,
16087                     Each_Line_Handler => \&filter_blocks_lines
16088                     ),
16089     Input_file->new('PropList.txt', v2.0.0,
16090                     Each_Line_Handler => (($v_version lt v3.1.0)
16091                                             ? \&filter_old_style_proplist
16092                                             : undef),
16093                     ),
16094     Input_file->new('Unihan.txt', v2.0.0,
16095                     Pre_Handler => \&setup_unihan,
16096                     Optional => 1,
16097                     Each_Line_Handler => \&filter_unihan_line,
16098                         ),
16099     Input_file->new('SpecialCasing.txt', v2.1.8,
16100                     Each_Line_Handler => \&filter_special_casing_line,
16101                     Pre_Handler => \&setup_special_casing,
16102                     Has_Missings_Defaults => $IGNORED,
16103                     ),
16104     Input_file->new(
16105                     'LineBreak.txt', v3.0.0,
16106                     Has_Missings_Defaults => $NOT_IGNORED,
16107                     Property => 'Line_Break',
16108                     # Early versions had problematic syntax
16109                     Each_Line_Handler => (($v_version lt v3.1.0)
16110                                         ? \&filter_early_ea_lb
16111                                         : undef),
16112                     ),
16113     Input_file->new('EastAsianWidth.txt', v3.0.0,
16114                     Property => 'East_Asian_Width',
16115                     Has_Missings_Defaults => $NOT_IGNORED,
16116                     # Early versions had problematic syntax
16117                     Each_Line_Handler => (($v_version lt v3.1.0)
16118                                         ? \&filter_early_ea_lb
16119                                         : undef),
16120                     ),
16121     Input_file->new('CompositionExclusions.txt', v3.0.0,
16122                     Property => 'Composition_Exclusion',
16123                     ),
16124     Input_file->new('BidiMirroring.txt', v3.0.1,
16125                     Property => 'Bidi_Mirroring_Glyph',
16126                     ),
16127     Input_file->new("NormalizationTest.txt", v3.0.1,
16128                     Skip => 'Validation Tests',
16129                     ),
16130     Input_file->new('CaseFolding.txt', v3.0.1,
16131                     Pre_Handler => \&setup_case_folding,
16132                     Each_Line_Handler =>
16133                         [ ($v_version lt v3.1.0)
16134                                  ? \&filter_old_style_case_folding
16135                                  : undef,
16136                            \&filter_case_folding_line
16137                         ],
16138                     Has_Missings_Defaults => $IGNORED,
16139                     ),
16140     Input_file->new('DCoreProperties.txt', v3.1.0,
16141                     # 5.2 changed this file
16142                     Has_Missings_Defaults => (($v_version ge v5.2.0)
16143                                             ? $NOT_IGNORED
16144                                             : $NO_DEFAULTS),
16145                     ),
16146     Input_file->new('Scripts.txt', v3.1.0,
16147                     Property => 'Script',
16148                     Has_Missings_Defaults => $NOT_IGNORED,
16149                     ),
16150     Input_file->new('DNormalizationProps.txt', v3.1.0,
16151                     Has_Missings_Defaults => $NOT_IGNORED,
16152                     Each_Line_Handler => (($v_version lt v4.0.1)
16153                                       ? \&filter_old_style_normalization_lines
16154                                       : undef),
16155                     ),
16156     Input_file->new('HangulSyllableType.txt', v4.0.0,
16157                     Has_Missings_Defaults => $NOT_IGNORED,
16158                     Property => 'Hangul_Syllable_Type'),
16159     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
16160                     Property => 'Word_Break',
16161                     Has_Missings_Defaults => $NOT_IGNORED,
16162                     ),
16163     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
16164                     Property => 'Grapheme_Cluster_Break',
16165                     Has_Missings_Defaults => $NOT_IGNORED,
16166                     ),
16167     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
16168                     Handler => \&process_GCB_test,
16169                     ),
16170     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
16171                     Skip => 'Validation Tests',
16172                     ),
16173     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
16174                     Skip => 'Validation Tests',
16175                     ),
16176     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
16177                     Skip => 'Validation Tests',
16178                     ),
16179     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
16180                     Property => 'Sentence_Break',
16181                     Has_Missings_Defaults => $NOT_IGNORED,
16182                     ),
16183     Input_file->new('NamedSequences.txt', v4.1.0,
16184                     Handler => \&process_NamedSequences
16185                     ),
16186     Input_file->new('NameAliases.txt', v5.0.0,
16187                     Property => 'Name_Alias',
16188                     Pre_Handler => ($v_version le v6.0.0)
16189                                    ? \&setup_early_name_alias
16190                                    : undef,
16191                     Each_Line_Handler => ($v_version le v6.0.0)
16192                                    ? \&filter_early_version_name_alias_line
16193                                    : \&filter_later_version_name_alias_line,
16194                     ),
16195     Input_file->new("BidiTest.txt", v5.2.0,
16196                     Skip => 'Validation Tests',
16197                     ),
16198     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
16199                     Optional => 1,
16200                     Each_Line_Handler => \&filter_unihan_line,
16201                     ),
16202     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
16203                     Optional => 1,
16204                     Each_Line_Handler => \&filter_unihan_line,
16205                     ),
16206     Input_file->new('UnihanIRGSources.txt', v5.2.0,
16207                     Optional => 1,
16208                     Pre_Handler => \&setup_unihan,
16209                     Each_Line_Handler => \&filter_unihan_line,
16210                     ),
16211     Input_file->new('UnihanNumericValues.txt', v5.2.0,
16212                     Optional => 1,
16213                     Each_Line_Handler => \&filter_unihan_line,
16214                     ),
16215     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
16216                     Optional => 1,
16217                     Each_Line_Handler => \&filter_unihan_line,
16218                     ),
16219     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
16220                     Optional => 1,
16221                     Each_Line_Handler => \&filter_unihan_line,
16222                     ),
16223     Input_file->new('UnihanReadings.txt', v5.2.0,
16224                     Optional => 1,
16225                     Each_Line_Handler => \&filter_unihan_line,
16226                     ),
16227     Input_file->new('UnihanVariants.txt', v5.2.0,
16228                     Optional => 1,
16229                     Each_Line_Handler => \&filter_unihan_line,
16230                     ),
16231     Input_file->new('ScriptExtensions.txt', v6.0.0,
16232                     Property => 'Script_Extensions',
16233                     Pre_Handler => \&setup_script_extensions,
16234                     Each_Line_Handler => \&filter_script_extensions_line,
16235                     Has_Missings_Defaults => (($v_version le v6.0.0)
16236                                             ? $NO_DEFAULTS
16237                                             : $IGNORED),
16238                     ),
16239     # The two Indic files are actually available starting in v6.0.0, but their
16240     # property values are missing from PropValueAliases.txt in that release,
16241     # so that further work would have to be done to get them to work properly
16242     # for that release.
16243     Input_file->new('IndicMatraCategory.txt', v6.1.0,
16244                     Property => 'Indic_Matra_Category',
16245                     Has_Missings_Defaults => $NOT_IGNORED,
16246                     Skip => "Provisional; for the analysis and processing of Indic scripts",
16247                     ),
16248     Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
16249                     Property => 'Indic_Syllabic_Category',
16250                     Has_Missings_Defaults => $NOT_IGNORED,
16251                     Skip => "Provisional; for the analysis and processing of Indic scripts",
16252                     ),
16253 );
16254
16255 # End of all the preliminaries.
16256 # Do it...
16257
16258 if ($compare_versions) {
16259     Carp::my_carp(<<END
16260 Warning.  \$compare_versions is set.  Output is not suitable for production
16261 END
16262     );
16263 }
16264
16265 # Put into %potential_files a list of all the files in the directory structure
16266 # that could be inputs to this program, excluding those that we should ignore.
16267 # Use absolute file names because it makes it easier across machine types.
16268 my @ignored_files_full_names = map { File::Spec->rel2abs(
16269                                      internal_file_to_platform($_))
16270                                 } keys %ignored_files;
16271 File::Find::find({
16272     wanted=>sub {
16273         return unless /\.txt$/i;  # Some platforms change the name's case
16274         my $full = lc(File::Spec->rel2abs($_));
16275         $potential_files{$full} = 1
16276                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
16277         return;
16278     }
16279 }, File::Spec->curdir());
16280
16281 my @mktables_list_output_files;
16282 my $old_start_time = 0;
16283
16284 if (! -e $file_list) {
16285     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
16286     $write_unchanged_files = 1;
16287 } elsif ($write_unchanged_files) {
16288     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
16289 }
16290 else {
16291     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
16292     my $file_handle;
16293     if (! open $file_handle, "<", $file_list) {
16294         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
16295         $glob_list = 1;
16296     }
16297     else {
16298         my @input;
16299
16300         # Read and parse mktables.lst, placing the results from the first part
16301         # into @input, and the second part into @mktables_list_output_files
16302         for my $list ( \@input, \@mktables_list_output_files ) {
16303             while (<$file_handle>) {
16304                 s/^ \s+ | \s+ $//xg;
16305                 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
16306                     $old_start_time = $1;
16307                 }
16308                 next if /^ \s* (?: \# .* )? $/x;
16309                 last if /^ =+ $/x;
16310                 my ( $file ) = split /\t/;
16311                 push @$list, $file;
16312             }
16313             @$list = uniques(@$list);
16314             next;
16315         }
16316
16317         # Look through all the input files
16318         foreach my $input (@input) {
16319             next if $input eq 'version'; # Already have checked this.
16320
16321             # Ignore if doesn't exist.  The checking about whether we care or
16322             # not is done via the Input_file object.
16323             next if ! file_exists($input);
16324
16325             # The paths are stored with relative names, and with '/' as the
16326             # delimiter; convert to absolute on this machine
16327             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
16328             $potential_files{lc $full} = 1
16329                 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
16330         }
16331     }
16332
16333     close $file_handle;
16334 }
16335
16336 if ($glob_list) {
16337
16338     # Here wants to process all .txt files in the directory structure.
16339     # Convert them to full path names.  They are stored in the platform's
16340     # relative style
16341     my @known_files;
16342     foreach my $object (@input_file_objects) {
16343         my $file = $object->file;
16344         next unless defined $file;
16345         push @known_files, File::Spec->rel2abs($file);
16346     }
16347
16348     my @unknown_input_files;
16349     foreach my $file (keys %potential_files) {  # The keys are stored in lc
16350         next if grep { $file eq lc($_) } @known_files;
16351
16352         # Here, the file is unknown to us.  Get relative path name
16353         $file = File::Spec->abs2rel($file);
16354         push @unknown_input_files, $file;
16355
16356         # What will happen is we create a data structure for it, and add it to
16357         # the list of input files to process.  First get the subdirectories
16358         # into an array
16359         my (undef, $directories, undef) = File::Spec->splitpath($file);
16360         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
16361         my @directories = File::Spec->splitdir($directories);
16362
16363         # If the file isn't extracted (meaning none of the directories is the
16364         # extracted one), just add it to the end of the list of inputs.
16365         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
16366             push @input_file_objects, Input_file->new($file, v0);
16367         }
16368         else {
16369
16370             # Here, the file is extracted.  It needs to go ahead of most other
16371             # processing.  Search for the first input file that isn't a
16372             # special required property (that is, find one whose first_release
16373             # is non-0), and isn't extracted.  Also, the Age property file is
16374             # processed before the extracted ones, just in case
16375             # $compare_versions is set.
16376             for (my $i = 0; $i < @input_file_objects; $i++) {
16377                 if ($input_file_objects[$i]->first_released ne v0
16378                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
16379                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
16380                 {
16381                     splice @input_file_objects, $i, 0,
16382                                                 Input_file->new($file, v0);
16383                     last;
16384                 }
16385             }
16386
16387         }
16388     }
16389     if (@unknown_input_files) {
16390         print STDERR simple_fold(join_lines(<<END
16391
16392 The following files are unknown as to how to handle.  Assuming they are
16393 typical property files.  You'll know by later error messages if it worked or
16394 not:
16395 END
16396         ) . " " . join(", ", @unknown_input_files) . "\n\n");
16397     }
16398 } # End of looking through directory structure for more .txt files.
16399
16400 # Create the list of input files from the objects we have defined, plus
16401 # version
16402 my @input_files = 'version';
16403 foreach my $object (@input_file_objects) {
16404     my $file = $object->file;
16405     next if ! defined $file;    # Not all objects have files
16406     next if $object->optional && ! -e $file;
16407     push @input_files,  $file;
16408 }
16409
16410 if ( $verbosity >= $VERBOSE ) {
16411     print "Expecting ".scalar( @input_files )." input files. ",
16412          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
16413 }
16414
16415 # We set $most_recent to be the most recently changed input file, including
16416 # this program itself (done much earlier in this file)
16417 foreach my $in (@input_files) {
16418     next unless -e $in;        # Keep going even if missing a file
16419     my $mod_time = (stat $in)[9];
16420     $most_recent = $mod_time if $mod_time > $most_recent;
16421
16422     # See that the input files have distinct names, to warn someone if they
16423     # are adding a new one
16424     if ($make_list) {
16425         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
16426         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
16427         my @directories = File::Spec->splitdir($directories);
16428         my $base = $file =~ s/\.txt$//;
16429         construct_filename($file, 'mutable', \@directories);
16430     }
16431 }
16432
16433 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
16434               || ! scalar @mktables_list_output_files  # or if no outputs known
16435               || $old_start_time < $most_recent;       # or out-of-date
16436
16437 # Now we check to see if any output files are older than youngest, if
16438 # they are, we need to continue on, otherwise we can presumably bail.
16439 if (! $rebuild) {
16440     foreach my $out (@mktables_list_output_files) {
16441         if ( ! file_exists($out)) {
16442             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
16443             $rebuild = 1;
16444             last;
16445          }
16446         #local $to_trace = 1 if main::DEBUG;
16447         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
16448         if ( (stat $out)[9] <= $most_recent ) {
16449             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
16450             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
16451             $rebuild = 1;
16452             last;
16453         }
16454     }
16455 }
16456 if (! $rebuild) {
16457     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
16458     exit(0);
16459 }
16460 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
16461
16462 # Ready to do the major processing.  First create the perl pseudo-property.
16463 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
16464
16465 # Process each input file
16466 foreach my $file (@input_file_objects) {
16467     $file->run;
16468 }
16469
16470 # Finish the table generation.
16471
16472 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
16473 finish_Unicode();
16474
16475 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
16476 compile_perl();
16477
16478 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
16479 add_perl_synonyms();
16480
16481 print "Writing tables\n" if $verbosity >= $PROGRESS;
16482 write_all_tables();
16483
16484 # Write mktables.lst
16485 if ( $file_list and $make_list ) {
16486
16487     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
16488     foreach my $file (@input_files, @files_actually_output) {
16489         my (undef, $directories, $file) = File::Spec->splitpath($file);
16490         my @directories = File::Spec->splitdir($directories);
16491         $file = join '/', @directories, $file;
16492     }
16493
16494     my $ofh;
16495     if (! open $ofh,">",$file_list) {
16496         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
16497         return
16498     }
16499     else {
16500         my $localtime = localtime $start_time;
16501         print $ofh <<"END";
16502 #
16503 # $file_list -- File list for $0.
16504 #
16505 #   Autogenerated starting on $start_time ($localtime)
16506 #
16507 # - First section is input files
16508 #   ($0 itself is not listed but is automatically considered an input)
16509 # - Section separator is /^=+\$/
16510 # - Second section is a list of output files.
16511 # - Lines matching /^\\s*#/ are treated as comments
16512 #   which along with blank lines are ignored.
16513 #
16514
16515 # Input files:
16516
16517 END
16518         print $ofh "$_\n" for sort(@input_files);
16519         print $ofh "\n=================================\n# Output files:\n\n";
16520         print $ofh "$_\n" for sort @files_actually_output;
16521         print $ofh "\n# ",scalar(@input_files)," input files\n",
16522                 "# ",scalar(@files_actually_output)+1," output files\n\n",
16523                 "# End list\n";
16524         close $ofh
16525             or Carp::my_carp("Failed to close $ofh: $!");
16526
16527         print "Filelist has ",scalar(@input_files)," input files and ",
16528             scalar(@files_actually_output)+1," output files\n"
16529             if $verbosity >= $VERBOSE;
16530     }
16531 }
16532
16533 # Output these warnings unless -q explicitly specified.
16534 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
16535     if (@unhandled_properties) {
16536         print "\nProperties and tables that unexpectedly have no code points\n";
16537         foreach my $property (sort @unhandled_properties) {
16538             print $property, "\n";
16539         }
16540     }
16541
16542     if (%potential_files) {
16543         print "\nInput files that are not considered:\n";
16544         foreach my $file (sort keys %potential_files) {
16545             print File::Spec->abs2rel($file), "\n";
16546         }
16547     }
16548     print "\nAll done\n" if $verbosity >= $VERBOSE;
16549 }
16550 exit(0);
16551
16552 # TRAILING CODE IS USED BY make_property_test_script()
16553 __DATA__
16554
16555 use strict;
16556 use warnings;
16557
16558 # If run outside the normal test suite on an ASCII platform, you can
16559 # just create a latin1_to_native() function that just returns its
16560 # inputs, because that's the only function used from test.pl
16561 require "test.pl";
16562
16563 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
16564 # constructed by mktables from the tables it generates, so if mktables is
16565 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
16566 # feasible properties; a few aren't currently feasible; see
16567 # is_code_point_usable() in mktables for details.
16568
16569 # Standard test packages are not used because this manipulates SIG_WARN.  It
16570 # exits 0 if every non-skipped test succeeded; -1 if any failed.
16571
16572 my $Tests = 0;
16573 my $Fails = 0;
16574
16575 sub Expect($$$$) {
16576     my $expected = shift;
16577     my $ord = shift;
16578     my $regex  = shift;
16579     my $warning_type = shift;   # Type of warning message, like 'deprecated'
16580                                 # or empty if none
16581     my $line   = (caller)[2];
16582     $ord = ord(latin1_to_native(chr($ord)));
16583
16584     # Convert the code point to hex form
16585     my $string = sprintf "\"\\x{%04X}\"", $ord;
16586
16587     my @tests = "";
16588
16589     # The first time through, use all warnings.  If the input should generate
16590     # a warning, add another time through with them turned off
16591     push @tests, "no warnings '$warning_type';" if $warning_type;
16592
16593     foreach my $no_warnings (@tests) {
16594
16595         # Store any warning messages instead of outputting them
16596         local $SIG{__WARN__} = $SIG{__WARN__};
16597         my $warning_message;
16598         $SIG{__WARN__} = sub { $warning_message = $_[0] };
16599
16600         $Tests++;
16601
16602         # A string eval is needed because of the 'no warnings'.
16603         # Assumes no parens in the regular expression
16604         my $result = eval "$no_warnings
16605                             my \$RegObj = qr($regex);
16606                             $string =~ \$RegObj ? 1 : 0";
16607         if (not defined $result) {
16608             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
16609             $Fails++;
16610         }
16611         elsif ($result ^ $expected) {
16612             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
16613             $Fails++;
16614         }
16615         elsif ($warning_message) {
16616             if (! $warning_type || ($warning_type && $no_warnings)) {
16617                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
16618                 $Fails++;
16619             }
16620             else {
16621                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
16622             }
16623         }
16624         elsif ($warning_type && ! $no_warnings) {
16625             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
16626             $Fails++;
16627         }
16628         else {
16629             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
16630         }
16631     }
16632     return;
16633 }
16634
16635 sub Error($) {
16636     my $regex  = shift;
16637     $Tests++;
16638     if (eval { 'x' =~ qr/$regex/; 1 }) {
16639         $Fails++;
16640         my $line = (caller)[2];
16641         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
16642     }
16643     else {
16644         my $line = (caller)[2];
16645         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
16646     }
16647     return;
16648 }
16649
16650 # GCBTest.txt character that separates grapheme clusters
16651 my $breakable_utf8 = my $breakable = chr(0xF7);
16652 utf8::upgrade($breakable_utf8);
16653
16654 # GCBTest.txt character that indicates that the adjoining code points are part
16655 # of the same grapheme cluster
16656 my $nobreak_utf8 = my $nobreak = chr(0xD7);
16657 utf8::upgrade($nobreak_utf8);
16658
16659 sub Test_X($) {
16660     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
16661     # Each such line is a sequence of code points given by their hex numbers,
16662     # separated by the two characters defined just before this subroutine that
16663     # indicate that either there can or cannot be a break between the adjacent
16664     # code points.  If there isn't a break, that means the sequence forms an
16665     # extended grapheme cluster, which means that \X should match the whole
16666     # thing.  If there is a break, \X should stop there.  This is all
16667     # converted by this routine into a match:
16668     #   $string =~ /(\X)/,
16669     # Each \X should match the next cluster; and that is what is checked.
16670
16671     my $template = shift;
16672
16673     my $line   = (caller)[2];
16674
16675     # The line contains characters above the ASCII range, but in Latin1.  It
16676     # may or may not be in utf8, and if it is, it may or may not know it.  So,
16677     # convert these characters to 8 bits.  If knows is in utf8, simply
16678     # downgrade.
16679     if (utf8::is_utf8($template)) {
16680         utf8::downgrade($template);
16681     } else {
16682
16683         # Otherwise, if it is in utf8, but doesn't know it, the next lines
16684         # convert the two problematic characters to their 8-bit equivalents.
16685         # If it isn't in utf8, they don't harm anything.
16686         use bytes;
16687         $template =~ s/$nobreak_utf8/$nobreak/g;
16688         $template =~ s/$breakable_utf8/$breakable/g;
16689     }
16690
16691     # Get rid of the leading and trailing breakables
16692     $template =~ s/^ \s* $breakable \s* //x;
16693     $template =~ s/ \s* $breakable \s* $ //x;
16694
16695     # And no-breaks become just a space.
16696     $template =~ s/ \s* $nobreak \s* / /xg;
16697
16698     # Split the input into segments that are breakable between them.
16699     my @segments = split /\s*$breakable\s*/, $template;
16700
16701     my $string = "";
16702     my $display_string = "";
16703     my @should_match;
16704     my @should_display;
16705
16706     # Convert the code point sequence in each segment into a Perl string of
16707     # characters
16708     foreach my $segment (@segments) {
16709         my @code_points = split /\s+/, $segment;
16710         my $this_string = "";
16711         my $this_display = "";
16712         foreach my $code_point (@code_points) {
16713             $this_string .= latin1_to_native(chr(hex $code_point));
16714             $this_display .= "\\x{$code_point}";
16715         }
16716
16717         # The next cluster should match the string in this segment.
16718         push @should_match, $this_string;
16719         push @should_display, $this_display;
16720         $string .= $this_string;
16721         $display_string .= $this_display;
16722     }
16723
16724     # If a string can be represented in both non-ut8 and utf8, test both cases
16725     UPGRADE:
16726     for my $to_upgrade (0 .. 1) {
16727
16728         if ($to_upgrade) {
16729
16730             # If already in utf8, would just be a repeat
16731             next UPGRADE if utf8::is_utf8($string);
16732
16733             utf8::upgrade($string);
16734         }
16735
16736         # Finally, do the \X match.
16737         my @matches = $string =~ /(\X)/g;
16738
16739         # Look through each matched cluster to verify that it matches what we
16740         # expect.
16741         my $min = (@matches < @should_match) ? @matches : @should_match;
16742         for my $i (0 .. $min - 1) {
16743             $Tests++;
16744             if ($matches[$i] eq $should_match[$i]) {
16745                 print "ok $Tests - ";
16746                 if ($i == 0) {
16747                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
16748                 } else {
16749                     print "And \\X #", $i + 1,
16750                 }
16751                 print " correctly matched $should_display[$i]; line $line\n";
16752             } else {
16753                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
16754                                                     unpack("U*", $matches[$i]));
16755                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
16756                     $i + 1,
16757                     " should have matched $should_display[$i]",
16758                     " but instead matched $matches[$i]",
16759                     ".  Abandoning rest of line $line\n";
16760                 next UPGRADE;
16761             }
16762         }
16763
16764         # And the number of matches should equal the number of expected matches.
16765         $Tests++;
16766         if (@matches == @should_match) {
16767             print "ok $Tests - Nothing was left over; line $line\n";
16768         } else {
16769             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
16770         }
16771     }
16772
16773     return;
16774 }
16775
16776 sub Finished() {
16777     print "1..$Tests\n";
16778     exit($Fails ? -1 : 0);
16779 }
16780
16781 Error('\p{Script=InGreek}');    # Bug #69018
16782 Test_X("1100 $nobreak 1161");  # Bug #70940
16783 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
16784 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
16785 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726