lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34 use re "/aa";
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 ##########################################################################
  40 #
  41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  42 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  43 # a pod file and .t files, depending on option parameters.
  44 #
  45 # The structure of this file is:
  46 #   First these introductory comments; then
  47 #   code needed for everywhere, such as debugging stuff; then
  48 #   code to handle input parameters; then
  49 #   data structures likely to be of external interest (some of which depend on
  50 #       the input parameters, so follows them; then
  51 #   more data structures and subroutine and package (class) definitions; then
  52 #   the small actual loop to process the input files and finish up; then
  53 #   a __DATA__ section, for the .t tests
  54 #
  55 # This program works on all releases of Unicode so far.  The outputs have been
  56 # scrutinized most intently for release 5.1.  The others have been checked for
  57 # somewhat more than just sanity.  It can handle all non-provisional Unicode
  58 # character properties in those releases.
  59 #
  60 # This program is mostly about Unicode character (or code point) properties.
  61 # A property describes some attribute or quality of a code point, like if it
  62 # is lowercase or not, its name, what version of Unicode it was first defined
  63 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  64 # possibilities by making all properties into mappings from each code point
  65 # into some corresponding value.  In the case of it being lowercase or not,
  66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  67 # property maps each Unicode code point to a single value, called a "property
  68 # value".  (Some more recently defined properties, map a code point to a set
  69 # of values.)
  70 #
  71 # When using a property in a regular expression, what is desired isn't the
  72 # mapping of the code point to its property's value, but the reverse (or the
  73 # mathematical "inverse relation"): starting with the property value, "Does a
  74 # code point map to it?"  These are written in a "compound" form:
  75 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  76 # files containing the lists of code points that map to each such regular
  77 # expression property value, one file per list
  78 #
  79 # There is also a single form shortcut that Perl adds for many of the commonly
  80 # used properties.  This happens for all binary properties, plus script,
  81 # general_category, and block properties.
  82 #
  83 # Thus the outputs of this program are files.  There are map files, mostly in
  84 # the 'To' directory; and there are list files for use in regular expression
  85 # matching, all in subdirectories of the 'lib' directory, with each
  86 # subdirectory being named for the property that the lists in it are for.
  87 # Bookkeeping, test, and documentation files are also generated.
  88
  89 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  90 my $map_directory = 'To';        # Where map files go.
  91
  92 # DATA STRUCTURES
  93 #
  94 # The major data structures of this program are Property, of course, but also
  95 # Table.  There are two kinds of tables, very similar to each other.
  96 # "Match_Table" is the data structure giving the list of code points that have
  97 # a particular property value, mentioned above.  There is also a "Map_Table"
  98 # data structure which gives the property's mapping from code point to value.
  99 # There are two structures because the match tables need to be combined in
 100 # various ways, such as constructing unions, intersections, complements, etc.,
 101 # and the map ones don't.  And there would be problems, perhaps subtle, if
 102 # a map table were inadvertently operated on in some of those ways.
 103 # The use of separate classes with operations defined on one but not the other
 104 # prevents accidentally confusing the two.
 105 #
 106 # At the heart of each table's data structure is a "Range_List", which is just
 107 # an ordered list of "Ranges", plus ancillary information, and methods to
 108 # operate on them.  A Range is a compact way to store property information.
 109 # Each range has a starting code point, an ending code point, and a value that
 110 # is meant to apply to all the code points between the two end points,
 111 # inclusive.  For a map table, this value is the property value for those
 112 # code points.  Two such ranges could be written like this:
 113 #   0x41 .. 0x5A, 'Upper',
 114 #   0x61 .. 0x7A, 'Lower'
 115 #
 116 # Each range also has a type used as a convenience to classify the values.
 117 # Most ranges in this program will be Type 0, or normal, but there are some
 118 # ranges that have a non-zero type.  These are used only in map tables, and
 119 # are for mappings that don't fit into the normal scheme of things.  Mappings
 120 # that require a hash entry to communicate with utf8.c are one example;
 121 # another example is mappings for charnames.pm to use which indicate a name
 122 # that is algorithmically determinable from its code point (and the reverse).
 123 # These are used to significantly compact these tables, instead of listing
 124 # each one of the tens of thousands individually.
 125 #
 126 # In a match table, the value of a range is irrelevant (and hence the type as
 127 # well, which will always be 0), and arbitrarily set to the null string.
 128 # Using the example above, there would be two match tables for those two
 129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 130 # named Lower would contain 0x61..0x7A.
 131 #
 132 # Actually, there are two types of range lists, "Range_Map" is the one
 133 # associated with map tables, and "Range_List" with match tables.
 134 # Again, this is so that methods can be defined on one and not the others so
 135 # as to prevent operating on them in incorrect ways.
 136 #
 137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 138 # in the perl core.  All tables could in theory be written, but some are
 139 # suppressed because there is no current practical use for them.  It is easy
 140 # to change which get written by changing various lists that are near the top
 141 # of the actual code in this file.  The table data structures contain enough
 142 # ancillary information to allow them to be treated as separate entities for
 143 # writing, such as the path to each one's file.  There is a heading in each
 144 # map table that gives the format of its entries, and what the map is for all
 145 # the code points missing from it.  (This allows tables to be more compact.)
 146 #
 147 # The Property data structure contains one or more tables.  All properties
 148 # contain a map table (except the $perl property which is a
 149 # pseudo-property containing only match tables), and any properties that
 150 # are usable in regular expression matches also contain various matching
 151 # tables, one for each value the property can have.  A binary property can
 152 # have two values, True and False (or Y and N, which are preferred by Unicode
 153 # terminology).  Thus each of these properties will have a map table that
 154 # takes every code point and maps it to Y or N (but having ranges cuts the
 155 # number of entries in that table way down), and two match tables, one
 156 # which has a list of all the code points that map to Y, and one for all the
 157 # code points that map to N.  (For each binary property, a third table is also
 158 # generated for the pseudo Perl property.  It contains the identical code
 159 # points as the Y table, but can be written in regular expressions, not in the
 160 # compound form, but in a "single" form like \p{IsUppercase}.)  Many
 161 # properties are binary, but some properties have several possible values,
 162 # some have many, and properties like Name have a different value for every
 163 # named code point.  Those will not, unless the controlling lists are changed,
 164 # have their match tables written out.  But all the ones which can be used in
 165 # regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
 166 # a property would have either its map table or its match tables written but
 167 # not both.  Again, what gets written is controlled by lists which can easily
 168 # be changed.  Starting in 5.14, advantage was taken of this, and all the map
 169 # tables needed to reconstruct the Unicode db are now written out, while
 170 # suppressing the Unicode .txt files that contain the data.  Our tables are
 171 # much more compact than the .txt files, so a significant space savings was
 172 # achieved.  Also, tables are not written out that are trivially derivable
 173 # from tables that do get written.  So, there typically is no file containing
 174 # the code points not matched by a binary property (the table for \P{} versus
 175 # lowercase \p{}), since you just need to invert the True table to get the
 176 # False table.
 177
 178 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
 179 # how many match tables there are and the content of the maps.  This 'Type' is
 180 # different than a range 'Type', so don't get confused by the two concepts
 181 # having the same name.
 182 #
 183 # For information about the Unicode properties, see Unicode's UAX44 document:
 184
 185 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 186
 187 # As stated earlier, this program will work on any release of Unicode so far.
 188 # Most obvious problems in earlier data have NOT been corrected except when
 189 # necessary to make Perl or this program work reasonably, and to keep out
 190 # potential security issues.  For example, no folding information was given in
 191 # early releases, so this program substitutes lower case instead, just so that
 192 # a regular expression with the /i option will do something that actually
 193 # gives the right results in many cases.  There are also a couple other
 194 # corrections for version 1.1.5, commented at the point they are made.  As an
 195 # example of corrections that weren't made (but could be) is this statement
 196 # from DerivedAge.txt: "The supplementary private use code points and the
 197 # non-character code points were assigned in version 2.0, but not specifically
 198 # listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
 199 # it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
 200 # further down in these introductory comments.
 201 #
 202 # This program works on all non-provisional properties as of the current
 203 # Unicode release, though the files for some are suppressed for various
 204 # reasons.  You can change which are output by changing lists in this program.
 205 #
 206 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 207 # loose matchings rules (from Unicode TR18):
 208 #
 209 #    The recommended names for UCD properties and property values are in
 210 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 211 #    [PropValue]. There are both abbreviated names and longer, more
 212 #    descriptive names. It is strongly recommended that both names be
 213 #    recognized, and that loose matching of property names be used,
 214 #    whereby the case distinctions, whitespace, hyphens, and underbar
 215 #    are ignored.
 216 #
 217 # The program still allows Fuzzy to override its determination of if loose
 218 # matching should be used, but it isn't currently used, as it is no longer
 219 # needed; the calculations it makes are good enough.
 220 #
 221 # SUMMARY OF HOW IT WORKS:
 222 #
 223 #   Process arguments
 224 #
 225 #   A list is constructed containing each input file that is to be processed
 226 #
 227 #   Each file on the list is processed in a loop, using the associated handler
 228 #   code for each:
 229 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 230 #            first.  These files name the properties and property values.
 231 #            Objects are created of all the property and property value names
 232 #            that the rest of the input should expect, including all synonyms.
 233 #        The other input files give mappings from properties to property
 234 #           values.  That is, they list code points and say what the mapping
 235 #           is under the given property.  Some files give the mappings for
 236 #           just one property; and some for many.  This program goes through
 237 #           each file and populates the properties and their map tables from
 238 #           them.  Some properties are listed in more than one file, and
 239 #           Unicode has set up a precedence as to which has priority if there
 240 #           is a conflict.  Thus the order of processing matters, and this
 241 #           program handles the conflict possibility by processing the
 242 #           overriding input files last, so that if necessary they replace
 243 #           earlier values.
 244 #        After this is all done, the program creates the property mappings not
 245 #            furnished by Unicode, but derivable from what it does give.
 246 #        The tables of code points that match each property value in each
 247 #            property that is accessible by regular expressions are created.
 248 #        The Perl-defined properties are created and populated.  Many of these
 249 #            require data determined from the earlier steps
 250 #        Any Perl-defined synonyms are created, and name clashes between Perl
 251 #            and Unicode are reconciled and warned about.
 252 #        All the properties are written to files
 253 #        Any other files are written, and final warnings issued.
 254 #
 255 # For clarity, a number of operators have been overloaded to work on tables:
 256 #   ~ means invert (take all characters not in the set).  The more
 257 #       conventional '!' is not used because of the possibility of confusing
 258 #       it with the actual boolean operation.
 259 #   + means union
 260 #   - means subtraction
 261 #   & means intersection
 262 # The precedence of these is the order listed.  Parentheses should be
 263 # copiously used.  These are not a general scheme.  The operations aren't
 264 # defined for a number of things, deliberately, to avoid getting into trouble.
 265 # Operations are done on references and affect the underlying structures, so
 266 # that the copy constructors for them have been overloaded to not return a new
 267 # clone, but the input object itself.
 268 #
 269 # The bool operator is deliberately not overloaded to avoid confusion with
 270 # "should it mean if the object merely exists, or also is non-empty?".
 271 #
 272 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 273 #
 274 # This program needs to be able to run under miniperl.  Therefore, it uses a
 275 # minimum of other modules, and hence implements some things itself that could
 276 # be gotten from CPAN
 277 #
 278 # This program uses inputs published by the Unicode Consortium.  These can
 279 # change incompatibly between releases without the Perl maintainers realizing
 280 # it.  Therefore this program is now designed to try to flag these.  It looks
 281 # at the directories where the inputs are, and flags any unrecognized files.
 282 # It keeps track of all the properties in the files it handles, and flags any
 283 # that it doesn't know how to handle.  It also flags any input lines that
 284 # don't match the expected syntax, among other checks.
 285 #
 286 # It is also designed so if a new input file matches one of the known
 287 # templates, one hopefully just needs to add it to a list to have it
 288 # processed.
 289 #
 290 # As mentioned earlier, some properties are given in more than one file.  In
 291 # particular, the files in the extracted directory are supposedly just
 292 # reformattings of the others.  But they contain information not easily
 293 # derivable from the other files, including results for Unihan, which this
 294 # program doesn't ordinarily look at, and for unassigned code points.  They
 295 # also have historically had errors or been incomplete.  In an attempt to
 296 # create the best possible data, this program thus processes them first to
 297 # glean information missing from the other files; then processes those other
 298 # files to override any errors in the extracted ones.  Much of the design was
 299 # driven by this need to store things and then possibly override them.
 300 #
 301 # It tries to keep fatal errors to a minimum, to generate something usable for
 302 # testing purposes.  It always looks for files that could be inputs, and will
 303 # warn about any that it doesn't know how to handle (the -q option suppresses
 304 # the warning).
 305 #
 306 # Why is there more than one type of range?
 307 #   This simplified things.  There are some very specialized code points that
 308 #   have to be handled specially for output, such as Hangul syllable names.
 309 #   By creating a range type (done late in the development process), it
 310 #   allowed this to be stored with the range, and overridden by other input.
 311 #   Originally these were stored in another data structure, and it became a
 312 #   mess trying to decide if a second file that was for the same property was
 313 #   overriding the earlier one or not.
 314 #
 315 # Why are there two kinds of tables, match and map?
 316 #   (And there is a base class shared by the two as well.)  As stated above,
 317 #   they actually are for different things.  Development proceeded much more
 318 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 319 #   give the property value for every code point (actually every code point
 320 #   that doesn't map to a default value).  Match tables are used for regular
 321 #   expression matches, and are essentially the inverse mapping.  Separating
 322 #   the two allows more specialized methods, and error checks so that one
 323 #   can't just take the intersection of two map tables, for example, as that
 324 #   is nonsensical.
 325 #
 326 # What about 'fate' and 'status'.  The concept of a table's fate was created
 327 #   late when it became clear that something more was needed.  The difference
 328 #   between this and 'status' is unclean, and could be improved if someone
 329 #   wanted to spend the effort.
 330 #
 331 # DEBUGGING
 332 #
 333 # This program is written so it will run under miniperl.  Occasionally changes
 334 # will cause an error where the backtrace doesn't work well under miniperl.
 335 # To diagnose the problem, you can instead run it under regular perl, if you
 336 # have one compiled.
 337 #
 338 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 339 # to return true.  Then a line like
 340 #
 341 # local $to_trace = 1 if main::DEBUG;
 342 #
 343 # can be added to enable tracing in its lexical scope (plus dynamic) or until
 344 # you insert another line:
 345 #
 346 # local $to_trace = 0 if main::DEBUG;
 347 #
 348 # To actually trace, use a line like "trace $a, @b, %c, ...;
 349 #
 350 # Some of the more complex subroutines already have trace statements in them.
 351 # Permanent trace statements should be like:
 352 #
 353 # trace ... if main::DEBUG && $to_trace;
 354 #
 355 # If there is just one or a few files that you're debugging, you can easily
 356 # cause most everything else to be skipped.  Change the line
 357 #
 358 # my $debug_skip = 0;
 359 #
 360 # to 1, and every file whose object is in @input_file_objects and doesn't have
 361 # a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
 362 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
 363 #
 364 # To compare the output tables, it may be useful to specify the -annotate
 365 # flag.  This causes the tables to expand so there is one entry for each
 366 # non-algorithmically named code point giving, currently its name, and its
 367 # graphic representation if printable (and you have a font that knows about
 368 # it).  This makes it easier to see what the particular code points are in
 369 # each output table.  The tables are usable, but because they don't have
 370 # ranges (for the most part), a Perl using them will run slower.  Non-named
 371 # code points are annotated with a description of their status, and contiguous
 372 # ones with the same description will be output as a range rather than
 373 # individually.  Algorithmically named characters are also output as ranges,
 374 # except when there are just a few contiguous ones.
 375 #
 376 # FUTURE ISSUES
 377 #
 378 # The program would break if Unicode were to change its names so that
 379 # interior white space, underscores, or dashes differences were significant
 380 # within property and property value names.
 381 #
 382 # It might be easier to use the xml versions of the UCD if this program ever
 383 # would need heavy revision, and the ability to handle old versions was not
 384 # required.
 385 #
 386 # There is the potential for name collisions, in that Perl has chosen names
 387 # that Unicode could decide it also likes.  There have been such collisions in
 388 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 389 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 390 # collisions, which were withdrawn before the final release, because of Perl's
 391 # and other's protests.  These all involved new properties which began with
 392 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 393 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 394 # Unicode document, so they are unlikely to be used by Unicode for another
 395 # purpose.  However, they might try something beginning with 'In', or use any
 396 # of the other Perl-defined properties.  This program will warn you of name
 397 # collisions, and refuse to generate tables with them, but manual intervention
 398 # will be required in this event.  One scheme that could be implemented, if
 399 # necessary, would be to have this program generate another file, or add a
 400 # field to mktables.lst that gives the date of first definition of a property.
 401 # Each new release of Unicode would use that file as a basis for the next
 402 # iteration.  And the Perl synonym addition code could sort based on the age
 403 # of the property, so older properties get priority, and newer ones that clash
 404 # would be refused; hence existing code would not be impacted, and some other
 405 # synonym would have to be used for the new property.  This is ugly, and
 406 # manual intervention would certainly be easier to do in the short run; lets
 407 # hope it never comes to this.
 408 #
 409 # A NOTE ON UNIHAN
 410 #
 411 # This program can generate tables from the Unihan database.  But it doesn't
 412 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 413 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 414 # database was split into 8 different files, all beginning with the letters
 415 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 416 # know which of the many properties in the file(s) should have tables created
 417 # for them.  It will create tables for any properties listed in
 418 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 419 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 420 # property you want is not in those files of the release you are building
 421 # against, you must add it to those two arrays.  Starting in 4.0, the
 422 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 423 # is present in the directory, a table will be generated for that property.
 424 # In 5.2, several more properties were added.  For your convenience, the two
 425 # arrays are initialized with all the 6.0 listed properties that are also in
 426 # earlier releases.  But these are commented out.  You can just uncomment the
 427 # ones you want, or use them as a template for adding entries for other
 428 # properties.
 429 #
 430 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 431 # and filter_unihan_line() are the functions where this is done.  This program
 432 # already does some adjusting to make the lines look more like the rest of the
 433 # Unicode DB;  You can see what that is in filter_unihan_line()
 434 #
 435 # There is a bug in the 3.2 data file in which some values for the
 436 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 437 # could be added for these; or for a particular installation, the Unihan.txt
 438 # file could be edited to fix them.
 439 #
 440 # HOW TO ADD A FILE TO BE PROCESSED
 441 #
 442 # A new file from Unicode needs to have an object constructed for it in
 443 # @input_file_objects, probably at the end or at the end of the extracted
 444 # ones.  The program should warn you if its name will clash with others on
 445 # restrictive file systems, like DOS.  If so, figure out a better name, and
 446 # add lines to the README.perl file giving that.  If the file is a character
 447 # property, it should be in the format that Unicode has implicitly
 448 # standardized for such files for the more recently introduced ones.
 449 # If so, the Input_file constructor for @input_file_objects can just be the
 450 # file name and release it first appeared in.  If not, then it should be
 451 # possible to construct an each_line_handler() to massage the line into the
 452 # standardized form.
 453 #
 454 # For non-character properties, more code will be needed.  You can look at
 455 # the existing entries for clues.
 456 #
 457 # UNICODE VERSIONS NOTES
 458 #
 459 # The Unicode UCD has had a number of errors in it over the versions.  And
 460 # these remain, by policy, in the standard for that version.  Therefore it is
 461 # risky to correct them, because code may be expecting the error.  So this
 462 # program doesn't generally make changes, unless the error breaks the Perl
 463 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 464 # for U+1105, which causes real problems for the algorithms for Jamo
 465 # calculations, so it is changed here.
 466 #
 467 # But it isn't so clear cut as to what to do about concepts that are
 468 # introduced in a later release; should they extend back to earlier releases
 469 # where the concept just didn't exist?  It was easier to do this than to not,
 470 # so that's what was done.  For example, the default value for code points not
 471 # in the files for various properties was probably undefined until changed by
 472 # some version.  No_Block for blocks is such an example.  This program will
 473 # assign No_Block even in Unicode versions that didn't have it.  This has the
 474 # benefit that code being written doesn't have to special case earlier
 475 # versions; and the detriment that it doesn't match the Standard precisely for
 476 # the affected versions.
 477 #
 478 # Here are some observations about some of the issues in early versions:
 479 #
 480 # Prior to version 3.0, there were 3 character decompositions.  These are not
 481 # handled by Unicode::Normalize, nor will it compile when presented a version
 482 # that has them.  However, you can trivially get it to compile by simply
 483 # ignoring those decompositions, by changing the croak to a carp.  At the time
 484 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
 485 #
 486 #   croak("Weird Canonical Decomposition of U+$h");
 487 #
 488 # Simply change to a carp.  It will compile, but will not know about any three
 489 # character decomposition.
 490
 491 # The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
 492 # that the reason is that the CJK block starting at 4E00 was removed from
 493 # PropList, and was not put back in until 3.1.0.  The Perl extension (the
 494 # single property name \p{alpha}) has the correct values.  But the compound
 495 # form is simply not generated until 3.1, as it can be argued that prior to
 496 # this release, this was not an official property.  The comments for
 497 # filter_old_style_proplist() give more details.
 498 #
 499 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 500 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 501 # reason is that 3.2 introduced U+205F=medium math space, which was not
 502 # classed as white space, but Perl figured out that it should have been. 4.0
 503 # reclassified it correctly.
 504 #
 505 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 506 # this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
 507 # became 202, and ATBL was left with no code points, as all the ones that
 508 # mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
 509 # name for the class, it would not have been affected, but if it used the
 510 # mnemonic, it would have been.
 511 #
 512 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 513 # points which eventually came to have this script property value, instead
 514 # mapped to "Unknown".  But in the next release all these code points were
 515 # moved to \p{sc=common} instead.
 516 #
 517 # The default for missing code points for BidiClass is complicated.  Starting
 518 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 519 # tries to do the best it can for earlier releases.  It is done in
 520 # process_PropertyAliases()
 521 #
 522 # In version 2.1.2, the entry in UnicodeData.txt:
 523 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
 524 # should instead be
 525 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
 526 # Without this change, there are casing problems for this character.
 527 #
 528 ##############################################################################
 529
 530 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 531                         # and errors
 532 my $MAX_LINE_WIDTH = 78;
 533
 534 # Debugging aid to skip most files so as to not be distracted by them when
 535 # concentrating on the ones being debugged.  Add
 536 # non_skip => 1,
 537 # to the constructor for those files you want processed when you set this.
 538 # Files with a first version number of 0 are special: they are always
 539 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 540 # UnicodeData.txt must not be skipped if you want this program to not die
 541 # before normal completion.
 542 my $debug_skip = 0;
 543
 544
 545 # Normally these are suppressed.
 546 my $write_Unicode_deprecated_tables = 0;
 547
 548 # Set to 1 to enable tracing.
 549 our $to_trace = 0;
 550
 551 { # Closure for trace: debugging aid
 552     my $print_caller = 1;        # ? Include calling subroutine name
 553     my $main_with_colon = 'main::';
 554     my $main_colon_length = length($main_with_colon);
 555
 556     sub trace {
 557         return unless $to_trace;        # Do nothing if global flag not set
 558
 559         my @input = @_;
 560
 561         local $DB::trace = 0;
 562         $DB::trace = 0;          # Quiet 'used only once' message
 563
 564         my $line_number;
 565
 566         # Loop looking up the stack to get the first non-trace caller
 567         my $caller_line;
 568         my $caller_name;
 569         my $i = 0;
 570         do {
 571             $line_number = $caller_line;
 572             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 573             $caller = $main_with_colon unless defined $caller;
 574
 575             $caller_name = $caller;
 576
 577             # get rid of pkg
 578             $caller_name =~ s/.*:://;
 579             if (substr($caller_name, 0, $main_colon_length)
 580                 eq $main_with_colon)
 581             {
 582                 $caller_name = substr($caller_name, $main_colon_length);
 583             }
 584
 585         } until ($caller_name ne 'trace');
 586
 587         # If the stack was empty, we were called from the top level
 588         $caller_name = 'main' if ($caller_name eq ""
 589                                     || $caller_name eq 'trace');
 590
 591         my $output = "";
 592         foreach my $string (@input) {
 593             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 594             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 595                 $output .= simple_dumper($string);
 596             }
 597             else {
 598                 $string = "$string" if ref $string;
 599                 $string = $UNDEF unless defined $string;
 600                 chomp $string;
 601                 $string = '""' if $string eq "";
 602                 $output .= " " if $output ne ""
 603                                 && $string ne ""
 604                                 && substr($output, -1, 1) ne " "
 605                                 && substr($string, 0, 1) ne " ";
 606                 $output .= $string;
 607             }
 608         }
 609
 610         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 611         print STDERR "$caller_name: " if $print_caller;
 612         print STDERR $output, "\n";
 613         return;
 614     }
 615 }
 616
 617 # This is for a rarely used development feature that allows you to compare two
 618 # versions of the Unicode standard without having to deal with changes caused
 619 # by the code points introduced in the later version.  Change the 0 to a
 620 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 621 # code points introduced in that release and earlier will be used; later ones
 622 # are thrown away.  You use the version number of the earliest one you want to
 623 # compare; then run this program on directory structures containing each
 624 # release, and compare the outputs.  These outputs will therefore include only
 625 # the code points common to both releases, and you can see the changes caused
 626 # just by the underlying release semantic changes.  For versions earlier than
 627 # 3.2, you must copy a version of DAge.txt into the directory.
 628 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 629 my $compare_versions = DEBUG
 630                        && $string_compare_versions
 631                        && pack "C*", split /\./, $string_compare_versions;
 632
 633 sub uniques {
 634     # Returns non-duplicated input values.  From "Perl Best Practices:
 635     # Encapsulated Cleverness".  p. 455 in first edition.
 636
 637     my %seen;
 638     # Arguably this breaks encapsulation, if the goal is to permit multiple
 639     # distinct objects to stringify to the same value, and be interchangeable.
 640     # However, for this program, no two objects stringify identically, and all
 641     # lists passed to this function are either objects or strings. So this
 642     # doesn't affect correctness, but it does give a couple of percent speedup.
 643     no overloading;
 644     return grep { ! $seen{$_}++ } @_;
 645 }
 646
 647 $0 = File::Spec->canonpath($0);
 648
 649 my $make_test_script = 0;      # ? Should we output a test script
 650 my $make_norm_test_script = 0; # ? Should we output a normalization test script
 651 my $write_unchanged_files = 0; # ? Should we update the output files even if
 652                                #    we don't think they have changed
 653 my $use_directory = "";        # ? Should we chdir somewhere.
 654 my $pod_directory;             # input directory to store the pod file.
 655 my $pod_file = 'perluniprops';
 656 my $t_path;                     # Path to the .t test file
 657 my $file_list = 'mktables.lst'; # File to store input and output file names.
 658                                # This is used to speed up the build, by not
 659                                # executing the main body of the program if
 660                                # nothing on the list has changed since the
 661                                # previous build
 662 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 663                                # make a list so that when the pumpking is
 664                                # preparing a release, s/he won't have to do
 665                                # special things
 666 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 667                                # in the input.
 668 my $output_range_counts = $debugging_build;   # ? Should we include the number
 669                                               # of code points in ranges in
 670                                               # the output
 671 my $annotate = 0;              # ? Should character names be in the output
 672
 673 # Verbosity levels; 0 is quiet
 674 my $NORMAL_VERBOSITY = 1;
 675 my $PROGRESS = 2;
 676 my $VERBOSE = 3;
 677
 678 my $verbosity = $NORMAL_VERBOSITY;
 679
 680 # Process arguments
 681 while (@ARGV) {
 682     my $arg = shift @ARGV;
 683     if ($arg eq '-v') {
 684         $verbosity = $VERBOSE;
 685     }
 686     elsif ($arg eq '-p') {
 687         $verbosity = $PROGRESS;
 688         $| = 1;     # Flush buffers as we go.
 689     }
 690     elsif ($arg eq '-q') {
 691         $verbosity = 0;
 692     }
 693     elsif ($arg eq '-w') {
 694         $write_unchanged_files = 1; # update the files even if havent changed
 695     }
 696     elsif ($arg eq '-check') {
 697         my $this = shift @ARGV;
 698         my $ok = shift @ARGV;
 699         if ($this ne $ok) {
 700             print "Skipping as check params are not the same.\n";
 701             exit(0);
 702         }
 703     }
 704     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 705         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 706     }
 707     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 708     {
 709         $make_test_script = 1;
 710     }
 711     elsif ($arg eq '-makenormtest')
 712     {
 713         $make_norm_test_script = 1;
 714     }
 715     elsif ($arg eq '-makelist') {
 716         $make_list = 1;
 717     }
 718     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 719         -d $use_directory or croak "Unknown directory '$use_directory'";
 720     }
 721     elsif ($arg eq '-L') {
 722
 723         # Existence not tested until have chdir'd
 724         $file_list = shift;
 725     }
 726     elsif ($arg eq '-globlist') {
 727         $glob_list = 1;
 728     }
 729     elsif ($arg eq '-c') {
 730         $output_range_counts = ! $output_range_counts
 731     }
 732     elsif ($arg eq '-annotate') {
 733         $annotate = 1;
 734         $debugging_build = 1;
 735         $output_range_counts = 1;
 736     }
 737     else {
 738         my $with_c = 'with';
 739         $with_c .= 'out' if $output_range_counts;   # Complements the state
 740         croak <<END;
 741 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 742           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 743           [-check A B ]
 744   -c          : Output comments $with_c number of code points in ranges
 745   -q          : Quiet Mode: Only output serious warnings.
 746   -p          : Set verbosity level to normal plus show progress.
 747   -v          : Set Verbosity level high:  Show progress and non-serious
 748                 warnings
 749   -w          : Write files regardless
 750   -C dir      : Change to this directory before proceeding. All relative paths
 751                 except those specified by the -P and -T options will be done
 752                 with respect to this directory.
 753   -P dir      : Output $pod_file file to directory 'dir'.
 754   -T path     : Create a test script as 'path'; overrides -maketest
 755   -L filelist : Use alternate 'filelist' instead of standard one
 756   -globlist   : Take as input all non-Test *.txt files in current and sub
 757                 directories
 758   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 759                 overrides -T
 760   -makelist   : Rewrite the file list $file_list based on current setup
 761   -annotate   : Output an annotation for each character in the table files;
 762                 useful for debugging mktables, looking at diffs; but is slow,
 763                 memory intensive; resulting tables are usable but are slow and
 764                 very large (and currently fail the Unicode::UCD.t tests).
 765   -check A B  : Executes $0 only if A and B are the same
 766 END
 767     }
 768 }
 769
 770 # Stores the most-recently changed file.  If none have changed, can skip the
 771 # build
 772 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 773
 774 # Change directories now, because need to read 'version' early.
 775 if ($use_directory) {
 776     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 777         $pod_directory = File::Spec->rel2abs($pod_directory);
 778     }
 779     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 780         $t_path = File::Spec->rel2abs($t_path);
 781     }
 782     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 783     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 784         $pod_directory = File::Spec->abs2rel($pod_directory);
 785     }
 786     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 787         $t_path = File::Spec->abs2rel($t_path);
 788     }
 789 }
 790
 791 # Get Unicode version into regular and v-string.  This is done now because
 792 # various tables below get populated based on it.  These tables are populated
 793 # here to be near the top of the file, and so easily seeable by those needing
 794 # to modify things.
 795 open my $VERSION, "<", "version"
 796                     or croak "$0: can't open required file 'version': $!\n";
 797 my $string_version = <$VERSION>;
 798 close $VERSION;
 799 chomp $string_version;
 800 my $v_version = pack "C*", split /\./, $string_version;        # v string
 801
 802 # The following are the complete names of properties with property values that
 803 # are known to not match any code points in some versions of Unicode, but that
 804 # may change in the future so they should be matchable, hence an empty file is
 805 # generated for them.
 806 my @tables_that_may_be_empty = (
 807                                 'Joining_Type=Left_Joining',
 808                                 );
 809 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 810 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 811 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 812                                                     if $v_version ge v4.1.0;
 813 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 814                                                     if $v_version ge v6.0.0;
 815 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 816                                                     if $v_version ge v6.1.0;
 817 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
 818                                                     if $v_version ge v6.2.0;
 819
 820 # The lists below are hashes, so the key is the item in the list, and the
 821 # value is the reason why it is in the list.  This makes generation of
 822 # documentation easier.
 823
 824 my %why_suppressed;  # No file generated for these.
 825
 826 # Files aren't generated for empty extraneous properties.  This is arguable.
 827 # Extraneous properties generally come about because a property is no longer
 828 # used in a newer version of Unicode.  If we generated a file without code
 829 # points, programs that used to work on that property will still execute
 830 # without errors.  It just won't ever match (or will always match, with \P{}).
 831 # This means that the logic is now likely wrong.  I (khw) think its better to
 832 # find this out by getting an error message.  Just move them to the table
 833 # above to change this behavior
 834 my %why_suppress_if_empty_warn_if_not = (
 835
 836    # It is the only property that has ever officially been removed from the
 837    # Standard.  The database never contained any code points for it.
 838    'Special_Case_Condition' => 'Obsolete',
 839
 840    # Apparently never official, but there were code points in some versions of
 841    # old-style PropList.txt
 842    'Non_Break' => 'Obsolete',
 843 );
 844
 845 # These would normally go in the warn table just above, but they were changed
 846 # a long time before this program was written, so warnings about them are
 847 # moot.
 848 if ($v_version gt v3.2.0) {
 849     push @tables_that_may_be_empty,
 850                                 'Canonical_Combining_Class=Attached_Below_Left'
 851 }
 852
 853 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 854 # unless explicitly added.
 855 if ($v_version ge v5.2.0) {
 856     my $unihan = 'Unihan; remove from list if using Unihan';
 857     foreach my $table (qw (
 858                            kAccountingNumeric
 859                            kOtherNumeric
 860                            kPrimaryNumeric
 861                            kCompatibilityVariant
 862                            kIICore
 863                            kIRG_GSource
 864                            kIRG_HSource
 865                            kIRG_JSource
 866                            kIRG_KPSource
 867                            kIRG_MSource
 868                            kIRG_KSource
 869                            kIRG_TSource
 870                            kIRG_USource
 871                            kIRG_VSource
 872                            kRSUnicode
 873                         ))
 874     {
 875         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 876     }
 877 }
 878
 879 # Enum values for to_output_map() method in the Map_Table package.
 880 my $EXTERNAL_MAP = 1;
 881 my $INTERNAL_MAP = 2;
 882 my $OUTPUT_ADJUSTED = 3;
 883
 884 # To override computed values for writing the map tables for these properties.
 885 # The default for enum map tables is to write them out, so that the Unicode
 886 # .txt files can be removed, but all the data to compute any property value
 887 # for any code point is available in a more compact form.
 888 my %global_to_output_map = (
 889     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 890     # get stuck supporting it if things change.  Since it is a STRING
 891     # property, it normally would be listed in the pod, but INTERNAL_MAP
 892     # suppresses that.
 893     Unicode_1_Name => $INTERNAL_MAP,
 894
 895     Present_In => 0,                # Suppress, as easily computed from Age
 896     Block => 0,                     # Suppress, as Blocks.txt is retained.
 897
 898     # Suppress, as mapping can be found instead from the
 899     # Perl_Decomposition_Mapping file
 900     Decomposition_Type => 0,
 901 );
 902
 903 # Properties that this program ignores.
 904 my @unimplemented_properties;
 905
 906 # With this release, it is automatically handled if the Unihan db is
 907 # downloaded
 908 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
 909
 910 # There are several types of obsolete properties defined by Unicode.  These
 911 # must be hand-edited for every new Unicode release.
 912 my %why_deprecated;  # Generates a deprecated warning message if used.
 913 my %why_stabilized;  # Documentation only
 914 my %why_obsolete;    # Documentation only
 915
 916 {   # Closure
 917     my $simple = 'Perl uses the more complete version of this property';
 918     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 919
 920     my $other_properties = 'other properties';
 921     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 922     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 923
 924     %why_deprecated = (
 925         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 926         'Jamo_Short_Name' => $contributory,
 927         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 928         'Other_Alphabetic' => $contributory,
 929         'Other_Default_Ignorable_Code_Point' => $contributory,
 930         'Other_Grapheme_Extend' => $contributory,
 931         'Other_ID_Continue' => $contributory,
 932         'Other_ID_Start' => $contributory,
 933         'Other_Lowercase' => $contributory,
 934         'Other_Math' => $contributory,
 935         'Other_Uppercase' => $contributory,
 936         'Expands_On_NFC' => $why_no_expand,
 937         'Expands_On_NFD' => $why_no_expand,
 938         'Expands_On_NFKC' => $why_no_expand,
 939         'Expands_On_NFKD' => $why_no_expand,
 940     );
 941
 942     %why_suppressed = (
 943         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 944         # contains the same information, but without the algorithmically
 945         # determinable Hangul syllables'.  This file is not published, so it's
 946         # existence is not noted in the comment.
 947         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
 948
 949         'Indic_Matra_Category' => "Provisional",
 950         'Indic_Syllabic_Category' => "Provisional",
 951
 952         # Don't suppress ISO_Comment, as otherwise special handling is needed
 953         # to differentiate between it and gc=c, which can be written as 'isc',
 954         # which is the same characters as ISO_Comment's short name.
 955
 956         'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
 957
 958         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
 959         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 960         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 961         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 962
 963         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 964     );
 965
 966     foreach my $property (
 967
 968             # The following are suppressed because they were made contributory
 969             # or deprecated by Unicode before Perl ever thought about
 970             # supporting them.
 971             'Jamo_Short_Name',
 972             'Grapheme_Link',
 973             'Expands_On_NFC',
 974             'Expands_On_NFD',
 975             'Expands_On_NFKC',
 976             'Expands_On_NFKD',
 977
 978             # The following are suppressed because they have been marked
 979             # as deprecated for a sufficient amount of time
 980             'Other_Alphabetic',
 981             'Other_Default_Ignorable_Code_Point',
 982             'Other_Grapheme_Extend',
 983             'Other_ID_Continue',
 984             'Other_ID_Start',
 985             'Other_Lowercase',
 986             'Other_Math',
 987             'Other_Uppercase',
 988     ) {
 989         $why_suppressed{$property} = $why_deprecated{$property};
 990     }
 991
 992     # Customize the message for all the 'Other_' properties
 993     foreach my $property (keys %why_deprecated) {
 994         next if (my $main_property = $property) !~ s/^Other_//;
 995         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 996     }
 997 }
 998
 999 if ($write_Unicode_deprecated_tables) {
1000     foreach my $property (keys %why_suppressed) {
1001         delete $why_suppressed{$property} if $property =~
1002                                                     / ^ Other | Grapheme /x;
1003     }
1004 }
1005
1006 if ($v_version ge 4.0.0) {
1007     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1008     if ($v_version ge 6.0.0) {
1009         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1010     }
1011 }
1012 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1013     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1014     if ($v_version ge 6.0.0) {
1015         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1016     }
1017 }
1018
1019 # Probably obsolete forever
1020 if ($v_version ge v4.1.0) {
1021     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1022 }
1023 if ($v_version ge v6.0.0) {
1024     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1025     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1026 }
1027
1028 # This program can create files for enumerated-like properties, such as
1029 # 'Numeric_Type'.  This file would be the same format as for a string
1030 # property, with a mapping from code point to its value, so you could look up,
1031 # for example, the script a code point is in.  But no one so far wants this
1032 # mapping, or they have found another way to get it since this is a new
1033 # feature.  So no file is generated except if it is in this list.
1034 my @output_mapped_properties = split "\n", <<END;
1035 END
1036
1037 # If you are using the Unihan database in a Unicode version before 5.2, you
1038 # need to add the properties that you want to extract from it to this table.
1039 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1040 # listed, commented out
1041 my @cjk_properties = split "\n", <<'END';
1042 #cjkAccountingNumeric; kAccountingNumeric
1043 #cjkOtherNumeric; kOtherNumeric
1044 #cjkPrimaryNumeric; kPrimaryNumeric
1045 #cjkCompatibilityVariant; kCompatibilityVariant
1046 #cjkIICore ; kIICore
1047 #cjkIRG_GSource; kIRG_GSource
1048 #cjkIRG_HSource; kIRG_HSource
1049 #cjkIRG_JSource; kIRG_JSource
1050 #cjkIRG_KPSource; kIRG_KPSource
1051 #cjkIRG_KSource; kIRG_KSource
1052 #cjkIRG_TSource; kIRG_TSource
1053 #cjkIRG_USource; kIRG_USource
1054 #cjkIRG_VSource; kIRG_VSource
1055 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
1056 END
1057
1058 # Similarly for the property values.  For your convenience, the lines in the
1059 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
1060 # '#' marks (for Unicode versions before 5.2)
1061 my @cjk_property_values = split "\n", <<'END';
1062 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1063 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1064 ## @missing: 0000..10FFFF; cjkIICore; <none>
1065 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1066 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1067 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1068 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1069 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1070 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1071 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1072 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1073 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1074 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1075 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1076 END
1077
1078 # The input files don't list every code point.  Those not listed are to be
1079 # defaulted to some value.  Below are hard-coded what those values are for
1080 # non-binary properties as of 5.1.  Starting in 5.0, there are
1081 # machine-parsable comment lines in the files that give the defaults; so this
1082 # list shouldn't have to be extended.  The claim is that all missing entries
1083 # for binary properties will default to 'N'.  Unicode tried to change that in
1084 # 5.2, but the beta period produced enough protest that they backed off.
1085 #
1086 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1087 # be in the form that it expects.  The others may be synonyms.
1088 my $CODE_POINT = '<code point>';
1089 my %default_mapping = (
1090     Age => "Unassigned",
1091     # Bidi_Class => Complicated; set in code
1092     Bidi_Mirroring_Glyph => "",
1093     Block => 'No_Block',
1094     Canonical_Combining_Class => 0,
1095     Case_Folding => $CODE_POINT,
1096     Decomposition_Mapping => $CODE_POINT,
1097     Decomposition_Type => 'None',
1098     East_Asian_Width => "Neutral",
1099     FC_NFKC_Closure => $CODE_POINT,
1100     General_Category => 'Cn',
1101     Grapheme_Cluster_Break => 'Other',
1102     Hangul_Syllable_Type => 'NA',
1103     ISO_Comment => "",
1104     Jamo_Short_Name => "",
1105     Joining_Group => "No_Joining_Group",
1106     # Joining_Type => Complicated; set in code
1107     kIICore => 'N',   #                       Is converted to binary
1108     #Line_Break => Complicated; set in code
1109     Lowercase_Mapping => $CODE_POINT,
1110     Name => "",
1111     Name_Alias => "",
1112     NFC_QC => 'Yes',
1113     NFD_QC => 'Yes',
1114     NFKC_QC => 'Yes',
1115     NFKD_QC => 'Yes',
1116     Numeric_Type => 'None',
1117     Numeric_Value => 'NaN',
1118     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1119     Sentence_Break => 'Other',
1120     Simple_Case_Folding => $CODE_POINT,
1121     Simple_Lowercase_Mapping => $CODE_POINT,
1122     Simple_Titlecase_Mapping => $CODE_POINT,
1123     Simple_Uppercase_Mapping => $CODE_POINT,
1124     Titlecase_Mapping => $CODE_POINT,
1125     Unicode_1_Name => "",
1126     Unicode_Radical_Stroke => "",
1127     Uppercase_Mapping => $CODE_POINT,
1128     Word_Break => 'Other',
1129 );
1130
1131 # Below are files that Unicode furnishes, but this program ignores, and why.
1132 # NormalizationCorrections.txt requires some more explanation.  It documents
1133 # the cumulative fixes to erroneous normalizations in earlier Unicode
1134 # versions.  Its main purpose is so that someone running on an earlier version
1135 # can use this file to override what got published in that earlier release.
1136 # It would be easy for mktables to read and handle this file.  But all the
1137 # corrections in it should already be in the other files for the release it
1138 # is.  To get it to actually mean something useful, someone would have to be
1139 # using an earlier Unicode release, and copy it to the files for that release
1140 # and recomplile.  So far there has been no demand to do that, so this hasn't
1141 # been implemented.
1142 my %ignored_files = (
1143     'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1144     'Index.txt' => 'Alphabetical index of Unicode characters',
1145     'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1146     'NamesList.txt' => 'Annotated list of characters',
1147     'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1148     'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1149     'ReadMe.txt' => 'Documentation',
1150     'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized.  This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1151     'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1152     'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1153     'USourceData.pdf' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1154     'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1155     'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1156     'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1157     'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1158 );
1159
1160 my %skipped_files;  # List of files that we skip
1161
1162 ### End of externally interesting definitions, except for @input_file_objects
1163
1164 my $HEADER=<<"EOF";
1165 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1166 # This file is machine-generated by $0 from the Unicode
1167 # database, Version $string_version.  Any changes made here will be lost!
1168 EOF
1169
1170 my $INTERNAL_ONLY_HEADER = <<"EOF";
1171
1172 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1173 # This file is for internal use by core Perl only.  The format and even the
1174 # name or existence of this file are subject to change without notice.  Don't
1175 # use it directly.
1176 EOF
1177
1178 my $DEVELOPMENT_ONLY=<<"EOF";
1179 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1180 # This file contains information artificially constrained to code points
1181 # present in Unicode release $string_compare_versions.
1182 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1183 # not be used for production.
1184
1185 EOF
1186
1187 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1188 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1189 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1190
1191 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1192 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1193 # to decrease backtracking.  The first regex allows the code point to be at
1194 # the end of a word, but to work properly, the word shouldn't end with a valid
1195 # hex character.  The second one won't match a code point at the end of a
1196 # word, and doesn't have the run-on issue
1197 my $run_on_code_point_re =
1198             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1199 my $code_point_re = qr/\b$run_on_code_point_re/;
1200
1201 # This matches the beginning of the line in the Unicode db files that give the
1202 # defaults for code points not listed (i.e., missing) in the file.  The code
1203 # depends on this ending with a semi-colon, so it can assume it is a valid
1204 # field when the line is split() by semi-colons
1205 my $missing_defaults_prefix =
1206             qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1207
1208 # Property types.  Unicode has more types, but these are sufficient for our
1209 # purposes.
1210 my $UNKNOWN = -1;   # initialized to illegal value
1211 my $NON_STRING = 1; # Either binary or enum
1212 my $BINARY = 2;
1213 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1214                        # tables, additional true and false tables are
1215                        # generated so that false is anything matching the
1216                        # default value, and true is everything else.
1217 my $ENUM = 4;       # Include catalog
1218 my $STRING = 5;     # Anything else: string or misc
1219
1220 # Some input files have lines that give default values for code points not
1221 # contained in the file.  Sometimes these should be ignored.
1222 my $NO_DEFAULTS = 0;        # Must evaluate to false
1223 my $NOT_IGNORED = 1;
1224 my $IGNORED = 2;
1225
1226 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1227 # and will appear in the main body of the tables in the output files, but
1228 # there are other types of ranges as well, listed below, that are specially
1229 # handled.   There are pseudo-types as well that will never be stored as a
1230 # type, but will affect the calculation of the type.
1231
1232 # 0 is for normal, non-specials
1233 my $MULTI_CP = 1;           # Sequence of more than code point
1234 my $HANGUL_SYLLABLE = 2;
1235 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1236 my $NULL = 4;               # The map is to the null string; utf8.c can't
1237                             # handle these, nor is there an accepted syntax
1238                             # for them in \p{} constructs
1239 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1240                              # otherwise be $MULTI_CP type are instead type 0
1241
1242 # process_generic_property_file() can accept certain overrides in its input.
1243 # Each of these must begin AND end with $CMD_DELIM.
1244 my $CMD_DELIM = "\a";
1245 my $REPLACE_CMD = 'replace';    # Override the Replace
1246 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1247
1248 my $NO = 0;
1249 my $YES = 1;
1250
1251 # Values for the Replace argument to add_range.
1252 # $NO                      # Don't replace; add only the code points not
1253                            # already present.
1254 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1255                            # the comments at the subroutine definition.
1256 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1257 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1258                            # already there
1259 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1260                            # already there
1261 my $CROAK = 6;             # Die with an error if is already there
1262
1263 # Flags to give property statuses.  The phrases are to remind maintainers that
1264 # if the flag is changed, the indefinite article referring to it in the
1265 # documentation may need to be as well.
1266 my $NORMAL = "";
1267 my $DEPRECATED = 'D';
1268 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1269 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1270 my $DISCOURAGED = 'X';
1271 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1272 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1273 my $STRICTER = 'T';
1274 my $a_bold_stricter = "a 'B<$STRICTER>'";
1275 my $A_bold_stricter = "A 'B<$STRICTER>'";
1276 my $STABILIZED = 'S';
1277 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1278 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1279 my $OBSOLETE = 'O';
1280 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1281 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1282
1283 my %status_past_participles = (
1284     $DISCOURAGED => 'discouraged',
1285     $STABILIZED => 'stabilized',
1286     $OBSOLETE => 'obsolete',
1287     $DEPRECATED => 'deprecated',
1288 );
1289
1290 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1291 # externally documented.
1292 my $ORDINARY = 0;       # The normal fate.
1293 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1294                         # but there is a file written that can be used to
1295                         # reconstruct this table
1296 my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1297                         # for Perl's internal use only
1298 my $SUPPRESSED = 3;     # The file for this table is not written out, and as a
1299                         # result, we don't bother to do many computations on
1300                         # it.
1301 my $PLACEHOLDER = 4;    # Like $SUPPRESSED, but we go through all the
1302                         # computations anyway, as the values are needed for
1303                         # things to work.  This happens when we have Perl
1304                         # extensions that depend on Unicode tables that
1305                         # wouldn't normally be in a given Unicode version.
1306
1307 # The format of the values of the tables:
1308 my $EMPTY_FORMAT = "";
1309 my $BINARY_FORMAT = 'b';
1310 my $DECIMAL_FORMAT = 'd';
1311 my $FLOAT_FORMAT = 'f';
1312 my $INTEGER_FORMAT = 'i';
1313 my $HEX_FORMAT = 'x';
1314 my $RATIONAL_FORMAT = 'r';
1315 my $STRING_FORMAT = 's';
1316 my $ADJUST_FORMAT = 'a';
1317 my $DECOMP_STRING_FORMAT = 'c';
1318 my $STRING_WHITE_SPACE_LIST = 'sw';
1319
1320 my %map_table_formats = (
1321     $BINARY_FORMAT => 'binary',
1322     $DECIMAL_FORMAT => 'single decimal digit',
1323     $FLOAT_FORMAT => 'floating point number',
1324     $INTEGER_FORMAT => 'integer',
1325     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1326     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1327     $STRING_FORMAT => 'string',
1328     $ADJUST_FORMAT => 'some entries need adjustment',
1329     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1330     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1331 );
1332
1333 # Unicode didn't put such derived files in a separate directory at first.
1334 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1335 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1336 my $AUXILIARY = 'auxiliary';
1337
1338 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1339 # and into UCD.pl for the use of UCD.pm
1340 my %loose_to_file_of;       # loosely maps table names to their respective
1341                             # files
1342 my %stricter_to_file_of;    # same; but for stricter mapping.
1343 my %loose_property_to_file_of; # Maps a loose property name to its map file
1344 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1345                             # in the hash %utf8::SwashInfo
1346 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1347                              # their rational equivalent
1348 my %loose_property_name_of; # Loosely maps (non_string) property names to
1349                             # standard form
1350 my %string_property_loose_to_name; # Same, for string properties.
1351 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1352                             # the property name in standard loose form, and
1353                             # 'value' is the default value for that property,
1354                             # also in standard loose form.
1355 my %loose_to_standard_value; # loosely maps table names to the canonical
1356                             # alias for them
1357 my %ambiguous_names;        # keys are alias names (in standard form) that
1358                             # have more than one possible meaning.
1359 my %prop_aliases;           # Keys are standard property name; values are each
1360                             # one's aliases
1361 my %prop_value_aliases;     # Keys of top level are standard property name;
1362                             # values are keys to another hash,  Each one is
1363                             # one of the property's values, in standard form.
1364                             # The values are that prop-val's aliases.
1365 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1366
1367 # Most properties are immune to caseless matching, otherwise you would get
1368 # nonsensical results, as properties are a function of a code point, not
1369 # everything that is caselessly equivalent to that code point.  For example,
1370 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1371 # be true because 's' and 'S' are equivalent caselessly.  However,
1372 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1373 # extend that concept to those very few properties that are like this.  Each
1374 # such property will match the full range caselessly.  They are hard-coded in
1375 # the program; it's not worth trying to make it general as it's extremely
1376 # unlikely that they will ever change.
1377 my %caseless_equivalent_to;
1378
1379 # These constants names and values were taken from the Unicode standard,
1380 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1381 # syllables.  The '_string' versions are so generated tables can retain the
1382 # hex format, which is the more familiar value
1383 my $SBase_string = "0xAC00";
1384 my $SBase = CORE::hex $SBase_string;
1385 my $LBase_string = "0x1100";
1386 my $LBase = CORE::hex $LBase_string;
1387 my $VBase_string = "0x1161";
1388 my $VBase = CORE::hex $VBase_string;
1389 my $TBase_string = "0x11A7";
1390 my $TBase = CORE::hex $TBase_string;
1391 my $SCount = 11172;
1392 my $LCount = 19;
1393 my $VCount = 21;
1394 my $TCount = 28;
1395 my $NCount = $VCount * $TCount;
1396
1397 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1398 # with the above published constants.
1399 my %Jamo;
1400 my %Jamo_L;     # Leading consonants
1401 my %Jamo_V;     # Vowels
1402 my %Jamo_T;     # Trailing consonants
1403
1404 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1405 # The key is the base name of the code point, and the value is an
1406 # array giving all the ranges that use this base name.  Each range
1407 # is actually a hash giving the 'low' and 'high' values of it.
1408 my %names_ending_in_code_point;
1409 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1410                                         # removed from the names
1411 # Inverse mapping.  The list of ranges that have these kinds of
1412 # names.  Each element contains the low, high, and base names in an
1413 # anonymous hash.
1414 my @code_points_ending_in_code_point;
1415
1416 # To hold Unicode's normalization test suite
1417 my @normalization_tests;
1418
1419 # Boolean: does this Unicode version have the hangul syllables, and are we
1420 # writing out a table for them?
1421 my $has_hangul_syllables = 0;
1422
1423 # Does this Unicode version have code points whose names end in their
1424 # respective code points, and are we writing out a table for them?  0 for no;
1425 # otherwise points to first property that a table is needed for them, so that
1426 # if multiple tables are needed, we don't create duplicates
1427 my $needing_code_points_ending_in_code_point = 0;
1428
1429 my @backslash_X_tests;     # List of tests read in for testing \X
1430 my @unhandled_properties;  # Will contain a list of properties found in
1431                            # the input that we didn't process.
1432 my @match_properties;      # Properties that have match tables, to be
1433                            # listed in the pod
1434 my @map_properties;        # Properties that get map files written
1435 my @named_sequences;       # NamedSequences.txt contents.
1436 my %potential_files;       # Generated list of all .txt files in the directory
1437                            # structure so we can warn if something is being
1438                            # ignored.
1439 my @files_actually_output; # List of files we generated.
1440 my @more_Names;            # Some code point names are compound; this is used
1441                            # to store the extra components of them.
1442 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1443                            # the minimum before we consider it equivalent to a
1444                            # candidate rational
1445 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1446
1447 # These store references to certain commonly used property objects
1448 my $gc;
1449 my $perl;
1450 my $block;
1451 my $perl_charname;
1452 my $print;
1453 my $Any;
1454 my $script;
1455
1456 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1457 my $has_In_conflicts = 0;
1458 my $has_Is_conflicts = 0;
1459
1460 sub internal_file_to_platform ($) {
1461     # Convert our file paths which have '/' separators to those of the
1462     # platform.
1463
1464     my $file = shift;
1465     return undef unless defined $file;
1466
1467     return File::Spec->join(split '/', $file);
1468 }
1469
1470 sub file_exists ($) {   # platform independent '-e'.  This program internally
1471                         # uses slash as a path separator.
1472     my $file = shift;
1473     return 0 if ! defined $file;
1474     return -e internal_file_to_platform($file);
1475 }
1476
1477 sub objaddr($) {
1478     # Returns the address of the blessed input object.
1479     # It doesn't check for blessedness because that would do a string eval
1480     # every call, and the program is structured so that this is never called
1481     # for a non-blessed object.
1482
1483     no overloading; # If overloaded, numifying below won't work.
1484
1485     # Numifying a ref gives its address.
1486     return pack 'J', $_[0];
1487 }
1488
1489 # These are used only if $annotate is true.
1490 # The entire range of Unicode characters is examined to populate these
1491 # after all the input has been processed.  But most can be skipped, as they
1492 # have the same descriptive phrases, such as being unassigned
1493 my @viacode;            # Contains the 1 million character names
1494 my @printable;          # boolean: And are those characters printable?
1495 my @annotate_char_type; # Contains a type of those characters, specifically
1496                         # for the purposes of annotation.
1497 my $annotate_ranges;    # A map of ranges of code points that have the same
1498                         # name for the purposes of annotation.  They map to the
1499                         # upper edge of the range, so that the end point can
1500                         # be immediately found.  This is used to skip ahead to
1501                         # the end of a range, and avoid processing each
1502                         # individual code point in it.
1503 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1504                                    # characters, but excluding those which are
1505                                    # also noncharacter code points
1506
1507 # The annotation types are an extension of the regular range types, though
1508 # some of the latter are folded into one.  Make the new types negative to
1509 # avoid conflicting with the regular types
1510 my $SURROGATE_TYPE = -1;
1511 my $UNASSIGNED_TYPE = -2;
1512 my $PRIVATE_USE_TYPE = -3;
1513 my $NONCHARACTER_TYPE = -4;
1514 my $CONTROL_TYPE = -5;
1515 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1516
1517 sub populate_char_info ($) {
1518     # Used only with the $annotate option.  Populates the arrays with the
1519     # input code point's info that are needed for outputting more detailed
1520     # comments.  If calling context wants a return, it is the end point of
1521     # any contiguous range of characters that share essentially the same info
1522
1523     my $i = shift;
1524     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1525
1526     $viacode[$i] = $perl_charname->value_of($i) || "";
1527
1528     # A character is generally printable if Unicode says it is,
1529     # but below we make sure that most Unicode general category 'C' types
1530     # aren't.
1531     $printable[$i] = $print->contains($i);
1532
1533     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1534
1535     # Only these two regular types are treated specially for annotations
1536     # purposes
1537     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1538                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1539
1540     # Give a generic name to all code points that don't have a real name.
1541     # We output ranges, if applicable, for these.  Also calculate the end
1542     # point of the range.
1543     my $end;
1544     if (! $viacode[$i]) {
1545         my $nonchar;
1546         if ($gc-> table('Private_use')->contains($i)) {
1547             $viacode[$i] = 'Private Use';
1548             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1549             $printable[$i] = 0;
1550             $end = $gc->table('Private_Use')->containing_range($i)->end;
1551         }
1552         elsif ((defined ($nonchar =
1553                             Property::property_ref('Noncharacter_Code_Point'))
1554                && $nonchar->table('Y')->contains($i)))
1555         {
1556             $viacode[$i] = 'Noncharacter';
1557             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1558             $printable[$i] = 0;
1559             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1560                                                     containing_range($i)->end;
1561         }
1562         elsif ($gc-> table('Control')->contains($i)) {
1563             $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1564             $annotate_char_type[$i] = $CONTROL_TYPE;
1565             $printable[$i] = 0;
1566         }
1567         elsif ($gc-> table('Unassigned')->contains($i)) {
1568             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1569             $printable[$i] = 0;
1570             if ($v_version lt v2.0.0) { # No blocks in earliest releases
1571                 $viacode[$i] = 'Unassigned';
1572                 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1573             }
1574             else {
1575                 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1576
1577                 # Because we name the unassigned by the blocks they are in, it
1578                 # can't go past the end of that block, and it also can't go
1579                 # past the unassigned range it is in.  The special table makes
1580                 # sure that the non-characters, which are unassigned, are
1581                 # separated out.
1582                 $end = min($block->containing_range($i)->end,
1583                            $unassigned_sans_noncharacters->
1584                                                     containing_range($i)->end);
1585             }
1586         }
1587         elsif ($v_version lt v2.0.0) {  # No surrogates in earliest releases
1588             $viacode[$i] = $gc->value_of($i);
1589             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1590             $printable[$i] = 0;
1591         }
1592         elsif ($gc-> table('Surrogate')->contains($i)) {
1593             $viacode[$i] = 'Surrogate';
1594             $annotate_char_type[$i] = $SURROGATE_TYPE;
1595             $printable[$i] = 0;
1596             $end = $gc->table('Surrogate')->containing_range($i)->end;
1597         }
1598         else {
1599             Carp::my_carp_bug("Can't figure out how to annotate "
1600                               . sprintf("U+%04X", $i)
1601                               . ".  Proceeding anyway.");
1602             $viacode[$i] = 'UNKNOWN';
1603             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1604             $printable[$i] = 0;
1605         }
1606     }
1607
1608     # Here, has a name, but if it's one in which the code point number is
1609     # appended to the name, do that.
1610     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1611         $viacode[$i] .= sprintf("-%04X", $i);
1612         $end = $perl_charname->containing_range($i)->end;
1613     }
1614
1615     # And here, has a name, but if it's a hangul syllable one, replace it with
1616     # the correct name from the Unicode algorithm
1617     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1618         use integer;
1619         my $SIndex = $i - $SBase;
1620         my $L = $LBase + $SIndex / $NCount;
1621         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1622         my $T = $TBase + $SIndex % $TCount;
1623         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1624         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1625         $end = $perl_charname->containing_range($i)->end;
1626     }
1627
1628     return if ! defined wantarray;
1629     return $i if ! defined $end;    # If not a range, return the input
1630
1631     # Save this whole range so can find the end point quickly
1632     $annotate_ranges->add_map($i, $end, $end);
1633
1634     return $end;
1635 }
1636
1637 # Commented code below should work on Perl 5.8.
1638 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1639 ## the native perl version of it (which is what would operate under miniperl)
1640 ## is extremely slow, as it does a string eval every call.
1641 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1642 #                            && defined eval "require Scalar::Util";
1643 #
1644 #sub objaddr($) {
1645 #    # Returns the address of the blessed input object.  Uses the XS version if
1646 #    # available.  It doesn't check for blessedness because that would do a
1647 #    # string eval every call, and the program is structured so that this is
1648 #    # never called for a non-blessed object.
1649 #
1650 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1651 #
1652 #    # Check at least that is a ref.
1653 #    my $pkg = ref($_[0]) or return undef;
1654 #
1655 #    # Change to a fake package to defeat any overloaded stringify
1656 #    bless $_[0], 'main::Fake';
1657 #
1658 #    # Numifying a ref gives its address.
1659 #    my $addr = pack 'J', $_[0];
1660 #
1661 #    # Return to original class
1662 #    bless $_[0], $pkg;
1663 #    return $addr;
1664 #}
1665
1666 sub max ($$) {
1667     my $a = shift;
1668     my $b = shift;
1669     return $a if $a >= $b;
1670     return $b;
1671 }
1672
1673 sub min ($$) {
1674     my $a = shift;
1675     my $b = shift;
1676     return $a if $a <= $b;
1677     return $b;
1678 }
1679
1680 sub clarify_number ($) {
1681     # This returns the input number with underscores inserted every 3 digits
1682     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1683     # checked.
1684
1685     my $number = shift;
1686     my $pos = length($number) - 3;
1687     return $number if $pos <= 1;
1688     while ($pos > 0) {
1689         substr($number, $pos, 0) = '_';
1690         $pos -= 3;
1691     }
1692     return $number;
1693 }
1694
1695
1696 package Carp;
1697
1698 # These routines give a uniform treatment of messages in this program.  They
1699 # are placed in the Carp package to cause the stack trace to not include them,
1700 # although an alternative would be to use another package and set @CARP_NOT
1701 # for it.
1702
1703 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1704
1705 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1706 # and overload trying to load Scalar:Util under miniperl.  See
1707 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1708 undef $overload::VERSION;
1709
1710 sub my_carp {
1711     my $message = shift || "";
1712     my $nofold = shift || 0;
1713
1714     if ($message) {
1715         $message = main::join_lines($message);
1716         $message =~ s/^$0: *//;     # Remove initial program name
1717         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1718         $message = "\n$0: $message;";
1719
1720         # Fold the message with program name, semi-colon end punctuation
1721         # (which looks good with the message that carp appends to it), and a
1722         # hanging indent for continuation lines.
1723         $message = main::simple_fold($message, "", 4) unless $nofold;
1724         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1725                                     # appends is to the same line
1726     }
1727
1728     return $message if defined wantarray;   # If a caller just wants the msg
1729
1730     carp $message;
1731     return;
1732 }
1733
1734 sub my_carp_bug {
1735     # This is called when it is clear that the problem is caused by a bug in
1736     # this program.
1737
1738     my $message = shift;
1739     $message =~ s/^$0: *//;
1740     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1741     carp $message;
1742     return;
1743 }
1744
1745 sub carp_too_few_args {
1746     if (@_ != 2) {
1747         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1748         return;
1749     }
1750
1751     my $args_ref = shift;
1752     my $count = shift;
1753
1754     my_carp_bug("Need at least $count arguments to "
1755         . (caller 1)[3]
1756         . ".  Instead got: '"
1757         . join ', ', @$args_ref
1758         . "'.  No action taken.");
1759     return;
1760 }
1761
1762 sub carp_extra_args {
1763     my $args_ref = shift;
1764     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1765
1766     unless (ref $args_ref) {
1767         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1768         return;
1769     }
1770     my ($package, $file, $line) = caller;
1771     my $subroutine = (caller 1)[3];
1772
1773     my $list;
1774     if (ref $args_ref eq 'HASH') {
1775         foreach my $key (keys %$args_ref) {
1776             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1777         }
1778         $list = join ', ', each %{$args_ref};
1779     }
1780     elsif (ref $args_ref eq 'ARRAY') {
1781         foreach my $arg (@$args_ref) {
1782             $arg = $UNDEF unless defined $arg;
1783         }
1784         $list = join ', ', @$args_ref;
1785     }
1786     else {
1787         my_carp_bug("Can't cope with ref "
1788                 . ref($args_ref)
1789                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1790         return;
1791     }
1792
1793     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1794     return;
1795 }
1796
1797 package main;
1798
1799 { # Closure
1800
1801     # This program uses the inside-out method for objects, as recommended in
1802     # "Perl Best Practices".  This closure aids in generating those.  There
1803     # are two routines.  setup_package() is called once per package to set
1804     # things up, and then set_access() is called for each hash representing a
1805     # field in the object.  These routines arrange for the object to be
1806     # properly destroyed when no longer used, and for standard accessor
1807     # functions to be generated.  If you need more complex accessors, just
1808     # write your own and leave those accesses out of the call to set_access().
1809     # More details below.
1810
1811     my %constructor_fields; # fields that are to be used in constructors; see
1812                             # below
1813
1814     # The values of this hash will be the package names as keys to other
1815     # hashes containing the name of each field in the package as keys, and
1816     # references to their respective hashes as values.
1817     my %package_fields;
1818
1819     sub setup_package {
1820         # Sets up the package, creating standard DESTROY and dump methods
1821         # (unless already defined).  The dump method is used in debugging by
1822         # simple_dumper().
1823         # The optional parameters are:
1824         #   a)  a reference to a hash, that gets populated by later
1825         #       set_access() calls with one of the accesses being
1826         #       'constructor'.  The caller can then refer to this, but it is
1827         #       not otherwise used by these two routines.
1828         #   b)  a reference to a callback routine to call during destruction
1829         #       of the object, before any fields are actually destroyed
1830
1831         my %args = @_;
1832         my $constructor_ref = delete $args{'Constructor_Fields'};
1833         my $destroy_callback = delete $args{'Destroy_Callback'};
1834         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1835
1836         my %fields;
1837         my $package = (caller)[0];
1838
1839         $package_fields{$package} = \%fields;
1840         $constructor_fields{$package} = $constructor_ref;
1841
1842         unless ($package->can('DESTROY')) {
1843             my $destroy_name = "${package}::DESTROY";
1844             no strict "refs";
1845
1846             # Use typeglob to give the anonymous subroutine the name we want
1847             *$destroy_name = sub {
1848                 my $self = shift;
1849                 my $addr = do { no overloading; pack 'J', $self; };
1850
1851                 $self->$destroy_callback if $destroy_callback;
1852                 foreach my $field (keys %{$package_fields{$package}}) {
1853                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1854                     delete $package_fields{$package}{$field}{$addr};
1855                 }
1856                 return;
1857             }
1858         }
1859
1860         unless ($package->can('dump')) {
1861             my $dump_name = "${package}::dump";
1862             no strict "refs";
1863             *$dump_name = sub {
1864                 my $self = shift;
1865                 return dump_inside_out($self, $package_fields{$package}, @_);
1866             }
1867         }
1868         return;
1869     }
1870
1871     sub set_access {
1872         # Arrange for the input field to be garbage collected when no longer
1873         # needed.  Also, creates standard accessor functions for the field
1874         # based on the optional parameters-- none if none of these parameters:
1875         #   'addable'    creates an 'add_NAME()' accessor function.
1876         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1877         #                function.
1878         #   'settable'   creates a 'set_NAME()' accessor function.
1879         #   'constructor' doesn't create an accessor function, but adds the
1880         #                field to the hash that was previously passed to
1881         #                setup_package();
1882         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1883         # 'add' etc. all mean 'addable'.
1884         # The read accessor function will work on both array and scalar
1885         # values.  If another accessor in the parameter list is 'a', the read
1886         # access assumes an array.  You can also force it to be array access
1887         # by specifying 'readable_array' instead of 'readable'
1888         #
1889         # A sort-of 'protected' access can be set-up by preceding the addable,
1890         # readable or settable with some initial portion of 'protected_' (but,
1891         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1892         # "protection" is only by convention.  All that happens is that the
1893         # accessor functions' names begin with an underscore.  So instead of
1894         # calling set_foo, the call is _set_foo.  (Real protection could be
1895         # accomplished by having a new subroutine, end_package, called at the
1896         # end of each package, and then storing the __LINE__ ranges and
1897         # checking them on every accessor.  But that is way overkill.)
1898
1899         # We create anonymous subroutines as the accessors and then use
1900         # typeglobs to assign them to the proper package and name
1901
1902         my $name = shift;   # Name of the field
1903         my $field = shift;  # Reference to the inside-out hash containing the
1904                             # field
1905
1906         my $package = (caller)[0];
1907
1908         if (! exists $package_fields{$package}) {
1909             croak "$0: Must call 'setup_package' before 'set_access'";
1910         }
1911
1912         # Stash the field so DESTROY can get it.
1913         $package_fields{$package}{$name} = $field;
1914
1915         # Remaining arguments are the accessors.  For each...
1916         foreach my $access (@_) {
1917             my $access = lc $access;
1918
1919             my $protected = "";
1920
1921             # Match the input as far as it goes.
1922             if ($access =~ /^(p[^_]*)_/) {
1923                 $protected = $1;
1924                 if (substr('protected_', 0, length $protected)
1925                     eq $protected)
1926                 {
1927
1928                     # Add 1 for the underscore not included in $protected
1929                     $access = substr($access, length($protected) + 1);
1930                     $protected = '_';
1931                 }
1932                 else {
1933                     $protected = "";
1934                 }
1935             }
1936
1937             if (substr('addable', 0, length $access) eq $access) {
1938                 my $subname = "${package}::${protected}add_$name";
1939                 no strict "refs";
1940
1941                 # add_ accessor.  Don't add if already there, which we
1942                 # determine using 'eq' for scalars and '==' otherwise.
1943                 *$subname = sub {
1944                     use strict "refs";
1945                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1946                     my $self = shift;
1947                     my $value = shift;
1948                     my $addr = do { no overloading; pack 'J', $self; };
1949                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1950                     if (ref $value) {
1951                         return if grep { $value == $_ } @{$field->{$addr}};
1952                     }
1953                     else {
1954                         return if grep { $value eq $_ } @{$field->{$addr}};
1955                     }
1956                     push @{$field->{$addr}}, $value;
1957                     return;
1958                 }
1959             }
1960             elsif (substr('constructor', 0, length $access) eq $access) {
1961                 if ($protected) {
1962                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1963                 }
1964                 else {
1965                     $constructor_fields{$package}{$name} = $field;
1966                 }
1967             }
1968             elsif (substr('readable_array', 0, length $access) eq $access) {
1969
1970                 # Here has read access.  If one of the other parameters for
1971                 # access is array, or this one specifies array (by being more
1972                 # than just 'readable_'), then create a subroutine that
1973                 # assumes the data is an array.  Otherwise just a scalar
1974                 my $subname = "${package}::${protected}$name";
1975                 if (grep { /^a/i } @_
1976                     or length($access) > length('readable_'))
1977                 {
1978                     no strict "refs";
1979                     *$subname = sub {
1980                         use strict "refs";
1981                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1982                         my $addr = do { no overloading; pack 'J', $_[0]; };
1983                         if (ref $field->{$addr} ne 'ARRAY') {
1984                             my $type = ref $field->{$addr};
1985                             $type = 'scalar' unless $type;
1986                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1987                             return;
1988                         }
1989                         return scalar @{$field->{$addr}} unless wantarray;
1990
1991                         # Make a copy; had problems with caller modifying the
1992                         # original otherwise
1993                         my @return = @{$field->{$addr}};
1994                         return @return;
1995                     }
1996                 }
1997                 else {
1998
1999                     # Here not an array value, a simpler function.
2000                     no strict "refs";
2001                     *$subname = sub {
2002                         use strict "refs";
2003                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2004                         no overloading;
2005                         return $field->{pack 'J', $_[0]};
2006                     }
2007                 }
2008             }
2009             elsif (substr('settable', 0, length $access) eq $access) {
2010                 my $subname = "${package}::${protected}set_$name";
2011                 no strict "refs";
2012                 *$subname = sub {
2013                     use strict "refs";
2014                     if (main::DEBUG) {
2015                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2016                         Carp::carp_extra_args(\@_) if @_ > 2;
2017                     }
2018                     # $self is $_[0]; $value is $_[1]
2019                     no overloading;
2020                     $field->{pack 'J', $_[0]} = $_[1];
2021                     return;
2022                 }
2023             }
2024             else {
2025                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2026             }
2027         }
2028         return;
2029     }
2030 }
2031
2032 package Input_file;
2033
2034 # All input files use this object, which stores various attributes about them,
2035 # and provides for convenient, uniform handling.  The run method wraps the
2036 # processing.  It handles all the bookkeeping of opening, reading, and closing
2037 # the file, returning only significant input lines.
2038 #
2039 # Each object gets a handler which processes the body of the file, and is
2040 # called by run().  Most should use the generic, default handler, which has
2041 # code scrubbed to handle things you might not expect.  A handler should
2042 # basically be a while(next_line()) {...} loop.
2043 #
2044 # You can also set up handlers to
2045 #   1) call before the first line is read for pre processing
2046 #   2) call to adjust each line of the input before the main handler gets them
2047 #   3) call upon EOF before the main handler exits its loop
2048 #   4) call at the end for post processing
2049 #
2050 # $_ is used to store the input line, and is to be filtered by the
2051 # each_line_handler()s.  So, if the format of the line is not in the desired
2052 # format for the main handler, these are used to do that adjusting.  They can
2053 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2054 # so the $_ output of one is used as the input to the next.  None of the other
2055 # handlers are stackable, but could easily be changed to be so.
2056 #
2057 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2058 # which insert the parameters as lines to be processed before the next input
2059 # file line is read.  This allows the EOF handler to flush buffers, for
2060 # example.  The difference between the two routines is that the lines inserted
2061 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
2062 # called it from such a handler, you would get infinite recursion.)  Lines
2063 # inserted by insert_adjusted_lines() go directly to the main handler without
2064 # any adjustments.  If the  post-processing handler calls any of these, there
2065 # will be no effect.  Some error checking for these conditions could be added,
2066 # but it hasn't been done.
2067 #
2068 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2069 # to prevent further processing of the line.  This routine will output the
2070 # message as a warning once, and then keep a count of the lines that have the
2071 # same message, and output that count at the end of the file's processing.
2072 # This keeps the number of messages down to a manageable amount.
2073 #
2074 # get_missings() should be called to retrieve any @missing input lines.
2075 # Messages will be raised if this isn't done if the options aren't to ignore
2076 # missings.
2077
2078 sub trace { return main::trace(@_); }
2079
2080 { # Closure
2081     # Keep track of fields that are to be put into the constructor.
2082     my %constructor_fields;
2083
2084     main::setup_package(Constructor_Fields => \%constructor_fields);
2085
2086     my %file; # Input file name, required
2087     main::set_access('file', \%file, qw{ c r });
2088
2089     my %first_released; # Unicode version file was first released in, required
2090     main::set_access('first_released', \%first_released, qw{ c r });
2091
2092     my %handler;    # Subroutine to process the input file, defaults to
2093                     # 'process_generic_property_file'
2094     main::set_access('handler', \%handler, qw{ c });
2095
2096     my %property;
2097     # name of property this file is for.  defaults to none, meaning not
2098     # applicable, or is otherwise determinable, for example, from each line.
2099     main::set_access('property', \%property, qw{ c r });
2100
2101     my %optional;
2102     # If this is true, the file is optional.  If not present, no warning is
2103     # output.  If it is present, the string given by this parameter is
2104     # evaluated, and if false the file is not processed.
2105     main::set_access('optional', \%optional, 'c', 'r');
2106
2107     my %non_skip;
2108     # This is used for debugging, to skip processing of all but a few input
2109     # files.  Add 'non_skip => 1' to the constructor for those files you want
2110     # processed when you set the $debug_skip global.
2111     main::set_access('non_skip', \%non_skip, 'c');
2112
2113     my %skip;
2114     # This is used to skip processing of this input file semi-permanently,
2115     # when it evaluates to true.  The value should be the reason the file is
2116     # being skipped.  It is used for files that we aren't planning to process
2117     # anytime soon, but want to allow to be in the directory and not raise a
2118     # message that we are not handling.  Mostly for test files.  This is in
2119     # contrast to the non_skip element, which is supposed to be used very
2120     # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
2121     # pretty much will never look at can be placed in the global
2122     # %ignored_files instead.  Ones used here will be added to %skipped files
2123     main::set_access('skip', \%skip, 'c');
2124
2125     my %each_line_handler;
2126     # list of subroutines to look at and filter each non-comment line in the
2127     # file.  defaults to none.  The subroutines are called in order, each is
2128     # to adjust $_ for the next one, and the final one adjusts it for
2129     # 'handler'
2130     main::set_access('each_line_handler', \%each_line_handler, 'c');
2131
2132     my %has_missings_defaults;
2133     # ? Are there lines in the file giving default values for code points
2134     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2135     # the norm, but IGNORED means it has such lines, but the handler doesn't
2136     # use them.  Having these three states allows us to catch changes to the
2137     # UCD that this program should track
2138     main::set_access('has_missings_defaults',
2139                                         \%has_missings_defaults, qw{ c r });
2140
2141     my %pre_handler;
2142     # Subroutine to call before doing anything else in the file.  If undef, no
2143     # such handler is called.
2144     main::set_access('pre_handler', \%pre_handler, qw{ c });
2145
2146     my %eof_handler;
2147     # Subroutine to call upon getting an EOF on the input file, but before
2148     # that is returned to the main handler.  This is to allow buffers to be
2149     # flushed.  The handler is expected to call insert_lines() or
2150     # insert_adjusted() with the buffered material
2151     main::set_access('eof_handler', \%eof_handler, qw{ c r });
2152
2153     my %post_handler;
2154     # Subroutine to call after all the lines of the file are read in and
2155     # processed.  If undef, no such handler is called.
2156     main::set_access('post_handler', \%post_handler, qw{ c });
2157
2158     my %progress_message;
2159     # Message to print to display progress in lieu of the standard one
2160     main::set_access('progress_message', \%progress_message, qw{ c });
2161
2162     my %handle;
2163     # cache open file handle, internal.  Is undef if file hasn't been
2164     # processed at all, empty if has;
2165     main::set_access('handle', \%handle);
2166
2167     my %added_lines;
2168     # cache of lines added virtually to the file, internal
2169     main::set_access('added_lines', \%added_lines);
2170
2171     my %errors;
2172     # cache of errors found, internal
2173     main::set_access('errors', \%errors);
2174
2175     my %missings;
2176     # storage of '@missing' defaults lines
2177     main::set_access('missings', \%missings);
2178
2179     sub new {
2180         my $class = shift;
2181
2182         my $self = bless \do{ my $anonymous_scalar }, $class;
2183         my $addr = do { no overloading; pack 'J', $self; };
2184
2185         # Set defaults
2186         $handler{$addr} = \&main::process_generic_property_file;
2187         $non_skip{$addr} = 0;
2188         $skip{$addr} = 0;
2189         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2190         $handle{$addr} = undef;
2191         $added_lines{$addr} = [ ];
2192         $each_line_handler{$addr} = [ ];
2193         $errors{$addr} = { };
2194         $missings{$addr} = [ ];
2195
2196         # Two positional parameters.
2197         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2198         $file{$addr} = main::internal_file_to_platform(shift);
2199         $first_released{$addr} = shift;
2200
2201         # The rest of the arguments are key => value pairs
2202         # %constructor_fields has been set up earlier to list all possible
2203         # ones.  Either set or push, depending on how the default has been set
2204         # up just above.
2205         my %args = @_;
2206         foreach my $key (keys %args) {
2207             my $argument = $args{$key};
2208
2209             # Note that the fields are the lower case of the constructor keys
2210             my $hash = $constructor_fields{lc $key};
2211             if (! defined $hash) {
2212                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2213                 next;
2214             }
2215             if (ref $hash->{$addr} eq 'ARRAY') {
2216                 if (ref $argument eq 'ARRAY') {
2217                     foreach my $argument (@{$argument}) {
2218                         next if ! defined $argument;
2219                         push @{$hash->{$addr}}, $argument;
2220                     }
2221                 }
2222                 else {
2223                     push @{$hash->{$addr}}, $argument if defined $argument;
2224                 }
2225             }
2226             else {
2227                 $hash->{$addr} = $argument;
2228             }
2229             delete $args{$key};
2230         };
2231
2232         # If the file has a property for it, it means that the property is not
2233         # listed in the file's entries.  So add a handler to the list of line
2234         # handlers to insert the property name into the lines, to provide a
2235         # uniform interface to the final processing subroutine.
2236         # the final code doesn't have to worry about that.
2237         if ($property{$addr}) {
2238             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2239         }
2240
2241         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2242             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2243         }
2244
2245         # If skipping, set to optional, and add to list of ignored files,
2246         # including its reason
2247         if ($skip{$addr}) {
2248             $optional{$addr} = 1;
2249             $skipped_files{$file{$addr}} = $skip{$addr}
2250         }
2251
2252         return $self;
2253     }
2254
2255
2256     use overload
2257         fallback => 0,
2258         qw("") => "_operator_stringify",
2259         "." => \&main::_operator_dot,
2260         ".=" => \&main::_operator_dot_equal,
2261     ;
2262
2263     sub _operator_stringify {
2264         my $self = shift;
2265
2266         return __PACKAGE__ . " object for " . $self->file;
2267     }
2268
2269     # flag to make sure extracted files are processed early
2270     my $seen_non_extracted_non_age = 0;
2271
2272     sub run {
2273         # Process the input object $self.  This opens and closes the file and
2274         # calls all the handlers for it.  Currently,  this can only be called
2275         # once per file, as it destroy's the EOF handler
2276
2277         my $self = shift;
2278         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2279
2280         my $addr = do { no overloading; pack 'J', $self; };
2281
2282         my $file = $file{$addr};
2283
2284         # Don't process if not expecting this file (because released later
2285         # than this Unicode version), and isn't there.  This means if someone
2286         # copies it into an earlier version's directory, we will go ahead and
2287         # process it.
2288         return if $first_released{$addr} gt $v_version && ! -e $file;
2289
2290         # If in debugging mode and this file doesn't have the non-skip
2291         # flag set, and isn't one of the critical files, skip it.
2292         if ($debug_skip
2293             && $first_released{$addr} ne v0
2294             && ! $non_skip{$addr})
2295         {
2296             print "Skipping $file in debugging\n" if $verbosity;
2297             return;
2298         }
2299
2300         # File could be optional
2301         if ($optional{$addr}) {
2302             return unless -e $file;
2303             my $result = eval $optional{$addr};
2304             if (! defined $result) {
2305                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2306                 return;
2307             }
2308             if (! $result) {
2309                 if ($verbosity) {
2310                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2311                 }
2312                 return;
2313             }
2314         }
2315
2316         if (! defined $file || ! -e $file) {
2317
2318             # If the file doesn't exist, see if have internal data for it
2319             # (based on first_released being 0).
2320             if ($first_released{$addr} eq v0) {
2321                 $handle{$addr} = 'pretend_is_open';
2322             }
2323             else {
2324                 if (! $optional{$addr}  # File could be optional
2325                     && $v_version ge $first_released{$addr})
2326                 {
2327                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2328                 }
2329                 return;
2330             }
2331         }
2332         else {
2333
2334             # Here, the file exists.  Some platforms may change the case of
2335             # its name
2336             if ($seen_non_extracted_non_age) {
2337                 if ($file =~ /$EXTRACTED/i) {
2338                     Carp::my_carp_bug(main::join_lines(<<END
2339 $file should be processed just after the 'Prop...Alias' files, and before
2340 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2341 have subtle problems
2342 END
2343                     ));
2344                 }
2345             }
2346             elsif ($EXTRACTED_DIR
2347                     && $first_released{$addr} ne v0
2348                     && $file !~ /$EXTRACTED/i
2349                     && lc($file) ne 'dage.txt')
2350             {
2351                 # We don't set this (by the 'if' above) if we have no
2352                 # extracted directory, so if running on an early version,
2353                 # this test won't work.  Not worth worrying about.
2354                 $seen_non_extracted_non_age = 1;
2355             }
2356
2357             # And mark the file as having being processed, and warn if it
2358             # isn't a file we are expecting.  As we process the files,
2359             # they are deleted from the hash, so any that remain at the
2360             # end of the program are files that we didn't process.
2361             my $fkey = File::Spec->rel2abs($file);
2362             my $expecting = delete $potential_files{lc($fkey)};
2363
2364             Carp::my_carp("Was not expecting '$file'.") if
2365                     ! $expecting
2366                     && ! defined $handle{$addr};
2367
2368             # Having deleted from expected files, we can quit if not to do
2369             # anything.  Don't print progress unless really want verbosity
2370             if ($skip{$addr}) {
2371                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2372                 return;
2373             }
2374
2375             # Open the file, converting the slashes used in this program
2376             # into the proper form for the OS
2377             my $file_handle;
2378             if (not open $file_handle, "<", $file) {
2379                 Carp::my_carp("Can't open $file.  Skipping: $!");
2380                 return 0;
2381             }
2382             $handle{$addr} = $file_handle; # Cache the open file handle
2383         }
2384
2385         if ($verbosity >= $PROGRESS) {
2386             if ($progress_message{$addr}) {
2387                 print "$progress_message{$addr}\n";
2388             }
2389             else {
2390                 # If using a virtual file, say so.
2391                 print "Processing ", (-e $file)
2392                                        ? $file
2393                                        : "substitute $file",
2394                                      "\n";
2395             }
2396         }
2397
2398
2399         # Call any special handler for before the file.
2400         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2401
2402         # Then the main handler
2403         &{$handler{$addr}}($self);
2404
2405         # Then any special post-file handler.
2406         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2407
2408         # If any errors have been accumulated, output the counts (as the first
2409         # error message in each class was output when it was encountered).
2410         if ($errors{$addr}) {
2411             my $total = 0;
2412             my $types = 0;
2413             foreach my $error (keys %{$errors{$addr}}) {
2414                 $total += $errors{$addr}->{$error};
2415                 delete $errors{$addr}->{$error};
2416                 $types++;
2417             }
2418             if ($total > 1) {
2419                 my $message
2420                         = "A total of $total lines had errors in $file.  ";
2421
2422                 $message .= ($types == 1)
2423                             ? '(Only the first one was displayed.)'
2424                             : '(Only the first of each type was displayed.)';
2425                 Carp::my_carp($message);
2426             }
2427         }
2428
2429         if (@{$missings{$addr}}) {
2430             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2431         }
2432
2433         # If a real file handle, close it.
2434         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2435                                                         ref $handle{$addr};
2436         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2437                                # the file, as opposed to undef
2438         return;
2439     }
2440
2441     sub next_line {
2442         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2443         # if such a line exists.  'logical' means that any lines that have
2444         # been added via insert_lines() will be returned in $_ before the file
2445         # is read again.
2446
2447         my $self = shift;
2448         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2449
2450         my $addr = do { no overloading; pack 'J', $self; };
2451
2452         # Here the file is open (or if the handle is not a ref, is an open
2453         # 'virtual' file).  Get the next line; any inserted lines get priority
2454         # over the file itself.
2455         my $adjusted;
2456
2457         LINE:
2458         while (1) { # Loop until find non-comment, non-empty line
2459             #local $to_trace = 1 if main::DEBUG;
2460             my $inserted_ref = shift @{$added_lines{$addr}};
2461             if (defined $inserted_ref) {
2462                 ($adjusted, $_) = @{$inserted_ref};
2463                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2464                 return 1 if $adjusted;
2465             }
2466             else {
2467                 last if ! ref $handle{$addr}; # Don't read unless is real file
2468                 last if ! defined ($_ = readline $handle{$addr});
2469             }
2470             chomp;
2471             trace $_ if main::DEBUG && $to_trace;
2472
2473             # See if this line is the comment line that defines what property
2474             # value that code points that are not listed in the file should
2475             # have.  The format or existence of these lines is not guaranteed
2476             # by Unicode since they are comments, but the documentation says
2477             # that this was added for machine-readability, so probably won't
2478             # change.  This works starting in Unicode Version 5.0.  They look
2479             # like:
2480             #
2481             # @missing: 0000..10FFFF; Not_Reordered
2482             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2483             # @missing: 0000..10FFFF; ; NaN
2484             #
2485             # Save the line for a later get_missings() call.
2486             if (/$missing_defaults_prefix/) {
2487                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2488                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2489                 }
2490                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2491                     my @defaults = split /\s* ; \s*/x, $_;
2492
2493                     # The first field is the @missing, which ends in a
2494                     # semi-colon, so can safely shift.
2495                     shift @defaults;
2496
2497                     # Some of these lines may have empty field placeholders
2498                     # which get in the way.  An example is:
2499                     # @missing: 0000..10FFFF; ; NaN
2500                     # Remove them.  Process starting from the top so the
2501                     # splice doesn't affect things still to be looked at.
2502                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2503                         next if $defaults[$i] ne "";
2504                         splice @defaults, $i, 1;
2505                     }
2506
2507                     # What's left should be just the property (maybe) and the
2508                     # default.  Having only one element means it doesn't have
2509                     # the property.
2510                     my $default;
2511                     my $property;
2512                     if (@defaults >= 1) {
2513                         if (@defaults == 1) {
2514                             $default = $defaults[0];
2515                         }
2516                         else {
2517                             $property = $defaults[0];
2518                             $default = $defaults[1];
2519                         }
2520                     }
2521
2522                     if (@defaults < 1
2523                         || @defaults > 2
2524                         || ($default =~ /^</
2525                             && $default !~ /^<code *point>$/i
2526                             && $default !~ /^<none>$/i
2527                             && $default !~ /^<script>$/i))
2528                     {
2529                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2530                     }
2531                     else {
2532
2533                         # If the property is missing from the line, it should
2534                         # be the one for the whole file
2535                         $property = $property{$addr} if ! defined $property;
2536
2537                         # Change <none> to the null string, which is what it
2538                         # really means.  If the default is the code point
2539                         # itself, set it to <code point>, which is what
2540                         # Unicode uses (but sometimes they've forgotten the
2541                         # space)
2542                         if ($default =~ /^<none>$/i) {
2543                             $default = "";
2544                         }
2545                         elsif ($default =~ /^<code *point>$/i) {
2546                             $default = $CODE_POINT;
2547                         }
2548                         elsif ($default =~ /^<script>$/i) {
2549
2550                             # Special case this one.  Currently is from
2551                             # ScriptExtensions.txt, and means for all unlisted
2552                             # code points, use their Script property values.
2553                             # For the code points not listed in that file, the
2554                             # default value is 'Unknown'.
2555                             $default = "Unknown";
2556                         }
2557
2558                         # Store them as a sub-arrays with both components.
2559                         push @{$missings{$addr}}, [ $default, $property ];
2560                     }
2561                 }
2562
2563                 # There is nothing for the caller to process on this comment
2564                 # line.
2565                 next;
2566             }
2567
2568             # Remove comments and trailing space, and skip this line if the
2569             # result is empty
2570             s/#.*//;
2571             s/\s+$//;
2572             next if /^$/;
2573
2574             # Call any handlers for this line, and skip further processing of
2575             # the line if the handler sets the line to null.
2576             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2577                 &{$sub_ref}($self);
2578                 next LINE if /^$/;
2579             }
2580
2581             # Here the line is ok.  return success.
2582             return 1;
2583         } # End of looping through lines.
2584
2585         # If there is an EOF handler, call it (only once) and if it generates
2586         # more lines to process go back in the loop to handle them.
2587         if ($eof_handler{$addr}) {
2588             &{$eof_handler{$addr}}($self);
2589             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2590             goto LINE if $added_lines{$addr};
2591         }
2592
2593         # Return failure -- no more lines.
2594         return 0;
2595
2596     }
2597
2598 #   Not currently used, not fully tested.
2599 #    sub peek {
2600 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2601 #        # record.  Not callable from an each_line_handler(), nor does it call
2602 #        # an each_line_handler() on the line.
2603 #
2604 #        my $self = shift;
2605 #        my $addr = do { no overloading; pack 'J', $self; };
2606 #
2607 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2608 #            my ($adjusted, $line) = @{$inserted_ref};
2609 #            next if $adjusted;
2610 #
2611 #            # Remove comments and trailing space, and return a non-empty
2612 #            # resulting line
2613 #            $line =~ s/#.*//;
2614 #            $line =~ s/\s+$//;
2615 #            return $line if $line ne "";
2616 #        }
2617 #
2618 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2619 #        while (1) { # Loop until find non-comment, non-empty line
2620 #            local $to_trace = 1 if main::DEBUG;
2621 #            trace $_ if main::DEBUG && $to_trace;
2622 #            return if ! defined (my $line = readline $handle{$addr});
2623 #            chomp $line;
2624 #            push @{$added_lines{$addr}}, [ 0, $line ];
2625 #
2626 #            $line =~ s/#.*//;
2627 #            $line =~ s/\s+$//;
2628 #            return $line if $line ne "";
2629 #        }
2630 #
2631 #        return;
2632 #    }
2633
2634
2635     sub insert_lines {
2636         # Lines can be inserted so that it looks like they were in the input
2637         # file at the place it was when this routine is called.  See also
2638         # insert_adjusted_lines().  Lines inserted via this routine go through
2639         # any each_line_handler()
2640
2641         my $self = shift;
2642
2643         # Each inserted line is an array, with the first element being 0 to
2644         # indicate that this line hasn't been adjusted, and needs to be
2645         # processed.
2646         no overloading;
2647         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2648         return;
2649     }
2650
2651     sub insert_adjusted_lines {
2652         # Lines can be inserted so that it looks like they were in the input
2653         # file at the place it was when this routine is called.  See also
2654         # insert_lines().  Lines inserted via this routine are already fully
2655         # adjusted, ready to be processed; each_line_handler()s handlers will
2656         # not be called.  This means this is not a completely general
2657         # facility, as only the last each_line_handler on the stack should
2658         # call this.  It could be made more general, by passing to each of the
2659         # line_handlers their position on the stack, which they would pass on
2660         # to this routine, and that would replace the boolean first element in
2661         # the anonymous array pushed here, so that the next_line routine could
2662         # use that to call only those handlers whose index is after it on the
2663         # stack.  But this is overkill for what is needed now.
2664
2665         my $self = shift;
2666         trace $_[0] if main::DEBUG && $to_trace;
2667
2668         # Each inserted line is an array, with the first element being 1 to
2669         # indicate that this line has been adjusted
2670         no overloading;
2671         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2672         return;
2673     }
2674
2675     sub get_missings {
2676         # Returns the stored up @missings lines' values, and clears the list.
2677         # The values are in an array, consisting of the default in the first
2678         # element, and the property in the 2nd.  However, since these lines
2679         # can be stacked up, the return is an array of all these arrays.
2680
2681         my $self = shift;
2682         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2683
2684         my $addr = do { no overloading; pack 'J', $self; };
2685
2686         # If not accepting a list return, just return the first one.
2687         return shift @{$missings{$addr}} unless wantarray;
2688
2689         my @return = @{$missings{$addr}};
2690         undef @{$missings{$addr}};
2691         return @return;
2692     }
2693
2694     sub _insert_property_into_line {
2695         # Add a property field to $_, if this file requires it.
2696
2697         my $self = shift;
2698         my $addr = do { no overloading; pack 'J', $self; };
2699         my $property = $property{$addr};
2700         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2701
2702         $_ =~ s/(;|$)/; $property$1/;
2703         return;
2704     }
2705
2706     sub carp_bad_line {
2707         # Output consistent error messages, using either a generic one, or the
2708         # one given by the optional parameter.  To avoid gazillions of the
2709         # same message in case the syntax of a  file is way off, this routine
2710         # only outputs the first instance of each message, incrementing a
2711         # count so the totals can be output at the end of the file.
2712
2713         my $self = shift;
2714         my $message = shift;
2715         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2716
2717         my $addr = do { no overloading; pack 'J', $self; };
2718
2719         $message = 'Unexpected line' unless $message;
2720
2721         # No trailing punctuation so as to fit with our addenda.
2722         $message =~ s/[.:;,]$//;
2723
2724         # If haven't seen this exact message before, output it now.  Otherwise
2725         # increment the count of how many times it has occurred
2726         unless ($errors{$addr}->{$message}) {
2727             Carp::my_carp("$message in '$_' in "
2728                             . $file{$addr}
2729                             . " at line $..  Skipping this line;");
2730             $errors{$addr}->{$message} = 1;
2731         }
2732         else {
2733             $errors{$addr}->{$message}++;
2734         }
2735
2736         # Clear the line to prevent any further (meaningful) processing of it.
2737         $_ = "";
2738
2739         return;
2740     }
2741 } # End closure
2742
2743 package Multi_Default;
2744
2745 # Certain properties in early versions of Unicode had more than one possible
2746 # default for code points missing from the files.  In these cases, one
2747 # default applies to everything left over after all the others are applied,
2748 # and for each of the others, there is a description of which class of code
2749 # points applies to it.  This object helps implement this by storing the
2750 # defaults, and for all but that final default, an eval string that generates
2751 # the class that it applies to.
2752
2753
2754 {   # Closure
2755
2756     main::setup_package();
2757
2758     my %class_defaults;
2759     # The defaults structure for the classes
2760     main::set_access('class_defaults', \%class_defaults);
2761
2762     my %other_default;
2763     # The default that applies to everything left over.
2764     main::set_access('other_default', \%other_default, 'r');
2765
2766
2767     sub new {
2768         # The constructor is called with default => eval pairs, terminated by
2769         # the left-over default. e.g.
2770         # Multi_Default->new(
2771         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2772         #               -  0x200D',
2773         #        'R' => 'some other expression that evaluates to code points',
2774         #        .
2775         #        .
2776         #        .
2777         #        'U'));
2778
2779         my $class = shift;
2780
2781         my $self = bless \do{my $anonymous_scalar}, $class;
2782         my $addr = do { no overloading; pack 'J', $self; };
2783
2784         while (@_ > 1) {
2785             my $default = shift;
2786             my $eval = shift;
2787             $class_defaults{$addr}->{$default} = $eval;
2788         }
2789
2790         $other_default{$addr} = shift;
2791
2792         return $self;
2793     }
2794
2795     sub get_next_defaults {
2796         # Iterates and returns the next class of defaults.
2797         my $self = shift;
2798         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2799
2800         my $addr = do { no overloading; pack 'J', $self; };
2801
2802         return each %{$class_defaults{$addr}};
2803     }
2804 }
2805
2806 package Alias;
2807
2808 # An alias is one of the names that a table goes by.  This class defines them
2809 # including some attributes.  Everything is currently setup in the
2810 # constructor.
2811
2812
2813 {   # Closure
2814
2815     main::setup_package();
2816
2817     my %name;
2818     main::set_access('name', \%name, 'r');
2819
2820     my %loose_match;
2821     # Should this name match loosely or not.
2822     main::set_access('loose_match', \%loose_match, 'r');
2823
2824     my %make_re_pod_entry;
2825     # Some aliases should not get their own entries in the re section of the
2826     # pod, because they are covered by a wild-card, and some we want to
2827     # discourage use of.  Binary
2828     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2829
2830     my %ucd;
2831     # Is this documented to be accessible via Unicode::UCD
2832     main::set_access('ucd', \%ucd, 'r', 's');
2833
2834     my %status;
2835     # Aliases have a status, like deprecated, or even suppressed (which means
2836     # they don't appear in documentation).  Enum
2837     main::set_access('status', \%status, 'r');
2838
2839     my %ok_as_filename;
2840     # Similarly, some aliases should not be considered as usable ones for
2841     # external use, such as file names, or we don't want documentation to
2842     # recommend them.  Boolean
2843     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2844
2845     sub new {
2846         my $class = shift;
2847
2848         my $self = bless \do { my $anonymous_scalar }, $class;
2849         my $addr = do { no overloading; pack 'J', $self; };
2850
2851         $name{$addr} = shift;
2852         $loose_match{$addr} = shift;
2853         $make_re_pod_entry{$addr} = shift;
2854         $ok_as_filename{$addr} = shift;
2855         $status{$addr} = shift;
2856         $ucd{$addr} = shift;
2857
2858         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2859
2860         # Null names are never ok externally
2861         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2862
2863         return $self;
2864     }
2865 }
2866
2867 package Range;
2868
2869 # A range is the basic unit for storing code points, and is described in the
2870 # comments at the beginning of the program.  Each range has a starting code
2871 # point; an ending code point (not less than the starting one); a value
2872 # that applies to every code point in between the two end-points, inclusive;
2873 # and an enum type that applies to the value.  The type is for the user's
2874 # convenience, and has no meaning here, except that a non-zero type is
2875 # considered to not obey the normal Unicode rules for having standard forms.
2876 #
2877 # The same structure is used for both map and match tables, even though in the
2878 # latter, the value (and hence type) is irrelevant and could be used as a
2879 # comment.  In map tables, the value is what all the code points in the range
2880 # map to.  Type 0 values have the standardized version of the value stored as
2881 # well, so as to not have to recalculate it a lot.
2882
2883 sub trace { return main::trace(@_); }
2884
2885 {   # Closure
2886
2887     main::setup_package();
2888
2889     my %start;
2890     main::set_access('start', \%start, 'r', 's');
2891
2892     my %end;
2893     main::set_access('end', \%end, 'r', 's');
2894
2895     my %value;
2896     main::set_access('value', \%value, 'r');
2897
2898     my %type;
2899     main::set_access('type', \%type, 'r');
2900
2901     my %standard_form;
2902     # The value in internal standard form.  Defined only if the type is 0.
2903     main::set_access('standard_form', \%standard_form);
2904
2905     # Note that if these fields change, the dump() method should as well
2906
2907     sub new {
2908         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2909         my $class = shift;
2910
2911         my $self = bless \do { my $anonymous_scalar }, $class;
2912         my $addr = do { no overloading; pack 'J', $self; };
2913
2914         $start{$addr} = shift;
2915         $end{$addr} = shift;
2916
2917         my %args = @_;
2918
2919         my $value = delete $args{'Value'};  # Can be 0
2920         $value = "" unless defined $value;
2921         $value{$addr} = $value;
2922
2923         $type{$addr} = delete $args{'Type'} || 0;
2924
2925         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2926
2927         return $self;
2928     }
2929
2930     use overload
2931         fallback => 0,
2932         qw("") => "_operator_stringify",
2933         "." => \&main::_operator_dot,
2934         ".=" => \&main::_operator_dot_equal,
2935     ;
2936
2937     sub _operator_stringify {
2938         my $self = shift;
2939         my $addr = do { no overloading; pack 'J', $self; };
2940
2941         # Output it like '0041..0065 (value)'
2942         my $return = sprintf("%04X", $start{$addr})
2943                         .  '..'
2944                         . sprintf("%04X", $end{$addr});
2945         my $value = $value{$addr};
2946         my $type = $type{$addr};
2947         $return .= ' (';
2948         $return .= "$value";
2949         $return .= ", Type=$type" if $type != 0;
2950         $return .= ')';
2951
2952         return $return;
2953     }
2954
2955     sub standard_form {
2956         # Calculate the standard form only if needed, and cache the result.
2957         # The standard form is the value itself if the type is special.
2958         # This represents a considerable CPU and memory saving - at the time
2959         # of writing there are 368676 non-special objects, but the standard
2960         # form is only requested for 22047 of them - ie about 6%.
2961
2962         my $self = shift;
2963         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2964
2965         my $addr = do { no overloading; pack 'J', $self; };
2966
2967         return $standard_form{$addr} if defined $standard_form{$addr};
2968
2969         my $value = $value{$addr};
2970         return $value if $type{$addr};
2971         return $standard_form{$addr} = main::standardize($value);
2972     }
2973
2974     sub dump {
2975         # Human, not machine readable.  For machine readable, comment out this
2976         # entire routine and let the standard one take effect.
2977         my $self = shift;
2978         my $indent = shift;
2979         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2980
2981         my $addr = do { no overloading; pack 'J', $self; };
2982
2983         my $return = $indent
2984                     . sprintf("%04X", $start{$addr})
2985                     . '..'
2986                     . sprintf("%04X", $end{$addr})
2987                     . " '$value{$addr}';";
2988         if (! defined $standard_form{$addr}) {
2989             $return .= "(type=$type{$addr})";
2990         }
2991         elsif ($standard_form{$addr} ne $value{$addr}) {
2992             $return .= "(standard '$standard_form{$addr}')";
2993         }
2994         return $return;
2995     }
2996 } # End closure
2997
2998 package _Range_List_Base;
2999
3000 # Base class for range lists.  A range list is simply an ordered list of
3001 # ranges, so that the ranges with the lowest starting numbers are first in it.
3002 #
3003 # When a new range is added that is adjacent to an existing range that has the
3004 # same value and type, it merges with it to form a larger range.
3005 #
3006 # Ranges generally do not overlap, except that there can be multiple entries
3007 # of single code point ranges.  This is because of NameAliases.txt.
3008 #
3009 # In this program, there is a standard value such that if two different
3010 # values, have the same standard value, they are considered equivalent.  This
3011 # value was chosen so that it gives correct results on Unicode data
3012
3013 # There are a number of methods to manipulate range lists, and some operators
3014 # are overloaded to handle them.
3015
3016 sub trace { return main::trace(@_); }
3017
3018 { # Closure
3019
3020     our $addr;
3021
3022     # Max is initialized to a negative value that isn't adjacent to 0, for
3023     # simpler tests
3024     my $max_init = -2;
3025
3026     main::setup_package();
3027
3028     my %ranges;
3029     # The list of ranges
3030     main::set_access('ranges', \%ranges, 'readable_array');
3031
3032     my %max;
3033     # The highest code point in the list.  This was originally a method, but
3034     # actual measurements said it was used a lot.
3035     main::set_access('max', \%max, 'r');
3036
3037     my %each_range_iterator;
3038     # Iterator position for each_range()
3039     main::set_access('each_range_iterator', \%each_range_iterator);
3040
3041     my %owner_name_of;
3042     # Name of parent this is attached to, if any.  Solely for better error
3043     # messages.
3044     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3045
3046     my %_search_ranges_cache;
3047     # A cache of the previous result from _search_ranges(), for better
3048     # performance
3049     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3050
3051     sub new {
3052         my $class = shift;
3053         my %args = @_;
3054
3055         # Optional initialization data for the range list.
3056         my $initialize = delete $args{'Initialize'};
3057
3058         my $self;
3059
3060         # Use _union() to initialize.  _union() returns an object of this
3061         # class, which means that it will call this constructor recursively.
3062         # But it won't have this $initialize parameter so that it won't
3063         # infinitely loop on this.
3064         return _union($class, $initialize, %args) if defined $initialize;
3065
3066         $self = bless \do { my $anonymous_scalar }, $class;
3067         my $addr = do { no overloading; pack 'J', $self; };
3068
3069         # Optional parent object, only for debug info.
3070         $owner_name_of{$addr} = delete $args{'Owner'};
3071         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3072
3073         # Stringify, in case it is an object.
3074         $owner_name_of{$addr} = "$owner_name_of{$addr}";
3075
3076         # This is used only for error messages, and so a colon is added
3077         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3078
3079         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3080
3081         $max{$addr} = $max_init;
3082
3083         $_search_ranges_cache{$addr} = 0;
3084         $ranges{$addr} = [];
3085
3086         return $self;
3087     }
3088
3089     use overload
3090         fallback => 0,
3091         qw("") => "_operator_stringify",
3092         "." => \&main::_operator_dot,
3093         ".=" => \&main::_operator_dot_equal,
3094     ;
3095
3096     sub _operator_stringify {
3097         my $self = shift;
3098         my $addr = do { no overloading; pack 'J', $self; };
3099
3100         return "Range_List attached to '$owner_name_of{$addr}'"
3101                                                 if $owner_name_of{$addr};
3102         return "anonymous Range_List " . \$self;
3103     }
3104
3105     sub _union {
3106         # Returns the union of the input code points.  It can be called as
3107         # either a constructor or a method.  If called as a method, the result
3108         # will be a new() instance of the calling object, containing the union
3109         # of that object with the other parameter's code points;  if called as
3110         # a constructor, the first parameter gives the class that the new object
3111         # should be, and the second parameter gives the code points to go into
3112         # it.
3113         # In either case, there are two parameters looked at by this routine;
3114         # any additional parameters are passed to the new() constructor.
3115         #
3116         # The code points can come in the form of some object that contains
3117         # ranges, and has a conventionally named method to access them; or
3118         # they can be an array of individual code points (as integers); or
3119         # just a single code point.
3120         #
3121         # If they are ranges, this routine doesn't make any effort to preserve
3122         # the range values and types of one input over the other.  Therefore
3123         # this base class should not allow _union to be called from other than
3124         # initialization code, so as to prevent two tables from being added
3125         # together where the range values matter.  The general form of this
3126         # routine therefore belongs in a derived class, but it was moved here
3127         # to avoid duplication of code.  The failure to overload this in this
3128         # class keeps it safe.
3129         #
3130         # It does make the effort during initialization to accept tables with
3131         # multiple values for the same code point, and to preserve the order
3132         # of these.  If there is only one input range or range set, it doesn't
3133         # sort (as it should already be sorted to the desired order), and will
3134         # accept multiple values per code point.  Otherwise it will merge
3135         # multiple values into a single one.
3136
3137         my $self;
3138         my @args;   # Arguments to pass to the constructor
3139
3140         my $class = shift;
3141
3142         # If a method call, will start the union with the object itself, and
3143         # the class of the new object will be the same as self.
3144         if (ref $class) {
3145             $self = $class;
3146             $class = ref $self;
3147             push @args, $self;
3148         }
3149
3150         # Add the other required parameter.
3151         push @args, shift;
3152         # Rest of parameters are passed on to the constructor
3153
3154         # Accumulate all records from both lists.
3155         my @records;
3156         my $input_count = 0;
3157         for my $arg (@args) {
3158             #local $to_trace = 0 if main::DEBUG;
3159             trace "argument = $arg" if main::DEBUG && $to_trace;
3160             if (! defined $arg) {
3161                 my $message = "";
3162                 if (defined $self) {
3163                     no overloading;
3164                     $message .= $owner_name_of{pack 'J', $self};
3165                 }
3166                 Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3167                 return;
3168             }
3169
3170             $arg = [ $arg ] if ! ref $arg;
3171             my $type = ref $arg;
3172             if ($type eq 'ARRAY') {
3173                 foreach my $element (@$arg) {
3174                     push @records, Range->new($element, $element);
3175                     $input_count++;
3176                 }
3177             }
3178             elsif ($arg->isa('Range')) {
3179                 push @records, $arg;
3180                 $input_count++;
3181             }
3182             elsif ($arg->can('ranges')) {
3183                 push @records, $arg->ranges;
3184                 $input_count++;
3185             }
3186             else {
3187                 my $message = "";
3188                 if (defined $self) {
3189                     no overloading;
3190                     $message .= $owner_name_of{pack 'J', $self};
3191                 }
3192                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3193                 return;
3194             }
3195         }
3196
3197         # Sort with the range containing the lowest ordinal first, but if
3198         # two ranges start at the same code point, sort with the bigger range
3199         # of the two first, because it takes fewer cycles.
3200         if ($input_count > 1) {
3201             @records = sort { ($a->start <=> $b->start)
3202                                       or
3203                                     # if b is shorter than a, b->end will be
3204                                     # less than a->end, and we want to select
3205                                     # a, so want to return -1
3206                                     ($b->end <=> $a->end)
3207                                    } @records;
3208         }
3209
3210         my $new = $class->new(@_);
3211
3212         # Fold in records so long as they add new information.
3213         for my $set (@records) {
3214             my $start = $set->start;
3215             my $end   = $set->end;
3216             my $value = $set->value;
3217             my $type  = $set->type;
3218             if ($start > $new->max) {
3219                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3220             }
3221             elsif ($end > $new->max) {
3222                 $new->_add_delete('+', $new->max +1, $end, $value,
3223                                                                 Type => $type);
3224             }
3225             elsif ($input_count == 1) {
3226                 # Here, overlaps existing range, but is from a single input,
3227                 # so preserve the multiple values from that input.
3228                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3229                                                 Replace => $MULTIPLE_AFTER);
3230             }
3231         }
3232
3233         return $new;
3234     }
3235
3236     sub range_count {        # Return the number of ranges in the range list
3237         my $self = shift;
3238         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3239
3240         no overloading;
3241         return scalar @{$ranges{pack 'J', $self}};
3242     }
3243
3244     sub min {
3245         # Returns the minimum code point currently in the range list, or if
3246         # the range list is empty, 2 beyond the max possible.  This is a
3247         # method because used so rarely, that not worth saving between calls,
3248         # and having to worry about changing it as ranges are added and
3249         # deleted.
3250
3251         my $self = shift;
3252         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3253
3254         my $addr = do { no overloading; pack 'J', $self; };
3255
3256         # If the range list is empty, return a large value that isn't adjacent
3257         # to any that could be in the range list, for simpler tests
3258         return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3259         return $ranges{$addr}->[0]->start;
3260     }
3261
3262     sub contains {
3263         # Boolean: Is argument in the range list?  If so returns $i such that:
3264         #   range[$i]->end < $codepoint <= range[$i+1]->end
3265         # which is one beyond what you want; this is so that the 0th range
3266         # doesn't return false
3267         my $self = shift;
3268         my $codepoint = shift;
3269         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3270
3271         my $i = $self->_search_ranges($codepoint);
3272         return 0 unless defined $i;
3273
3274         # The search returns $i, such that
3275         #   range[$i-1]->end < $codepoint <= range[$i]->end
3276         # So is in the table if and only iff it is at least the start position
3277         # of range $i.
3278         no overloading;
3279         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3280         return $i + 1;
3281     }
3282
3283     sub containing_range {
3284         # Returns the range object that contains the code point, undef if none
3285
3286         my $self = shift;
3287         my $codepoint = shift;
3288         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3289
3290         my $i = $self->contains($codepoint);
3291         return unless $i;
3292
3293         # contains() returns 1 beyond where we should look
3294         no overloading;
3295         return $ranges{pack 'J', $self}->[$i-1];
3296     }
3297
3298     sub value_of {
3299         # Returns the value associated with the code point, undef if none
3300
3301         my $self = shift;
3302         my $codepoint = shift;
3303         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3304
3305         my $range = $self->containing_range($codepoint);
3306         return unless defined $range;
3307
3308         return $range->value;
3309     }
3310
3311     sub type_of {
3312         # Returns the type of the range containing the code point, undef if
3313         # the code point is not in the table
3314
3315         my $self = shift;
3316         my $codepoint = shift;
3317         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3318
3319         my $range = $self->containing_range($codepoint);
3320         return unless defined $range;
3321
3322         return $range->type;
3323     }
3324
3325     sub _search_ranges {
3326         # Find the range in the list which contains a code point, or where it
3327         # should go if were to add it.  That is, it returns $i, such that:
3328         #   range[$i-1]->end < $codepoint <= range[$i]->end
3329         # Returns undef if no such $i is possible (e.g. at end of table), or
3330         # if there is an error.
3331
3332         my $self = shift;
3333         my $code_point = shift;
3334         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3335
3336         my $addr = do { no overloading; pack 'J', $self; };
3337
3338         return if $code_point > $max{$addr};
3339         my $r = $ranges{$addr};                # The current list of ranges
3340         my $range_list_size = scalar @$r;
3341         my $i;
3342
3343         use integer;        # want integer division
3344
3345         # Use the cached result as the starting guess for this one, because,
3346         # an experiment on 5.1 showed that 90% of the time the cache was the
3347         # same as the result on the next call (and 7% it was one less).
3348         $i = $_search_ranges_cache{$addr};
3349         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3350                                             # from an intervening deletion
3351         #local $to_trace = 1 if main::DEBUG;
3352         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3353         return $i if $code_point <= $r->[$i]->end
3354                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3355
3356         # Here the cache doesn't yield the correct $i.  Try adding 1.
3357         if ($i < $range_list_size - 1
3358             && $r->[$i]->end < $code_point &&
3359             $code_point <= $r->[$i+1]->end)
3360         {
3361             $i++;
3362             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3363             $_search_ranges_cache{$addr} = $i;
3364             return $i;
3365         }
3366
3367         # Here, adding 1 also didn't work.  We do a binary search to
3368         # find the correct position, starting with current $i
3369         my $lower = 0;
3370         my $upper = $range_list_size - 1;
3371         while (1) {
3372             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3373
3374             if ($code_point <= $r->[$i]->end) {
3375
3376                 # Here we have met the upper constraint.  We can quit if we
3377                 # also meet the lower one.
3378                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3379
3380                 $upper = $i;        # Still too high.
3381
3382             }
3383             else {
3384
3385                 # Here, $r[$i]->end < $code_point, so look higher up.
3386                 $lower = $i;
3387             }
3388
3389             # Split search domain in half to try again.
3390             my $temp = ($upper + $lower) / 2;
3391
3392             # No point in continuing unless $i changes for next time
3393             # in the loop.
3394             if ($temp == $i) {
3395
3396                 # We can't reach the highest element because of the averaging.
3397                 # So if one below the upper edge, force it there and try one
3398                 # more time.
3399                 if ($i == $range_list_size - 2) {
3400
3401                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3402                     $i = $range_list_size - 1;
3403
3404                     # Change $lower as well so if fails next time through,
3405                     # taking the average will yield the same $i, and we will
3406                     # quit with the error message just below.
3407                     $lower = $i;
3408                     next;
3409                 }
3410                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3411                 return;
3412             }
3413             $i = $temp;
3414         } # End of while loop
3415
3416         if (main::DEBUG && $to_trace) {
3417             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3418             trace "i=  [ $i ]", $r->[$i];
3419             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3420         }
3421
3422         # Here we have found the offset.  Cache it as a starting point for the
3423         # next call.
3424         $_search_ranges_cache{$addr} = $i;
3425         return $i;
3426     }
3427
3428     sub _add_delete {
3429         # Add, replace or delete ranges to or from a list.  The $type
3430         # parameter gives which:
3431         #   '+' => insert or replace a range, returning a list of any changed
3432         #          ranges.
3433         #   '-' => delete a range, returning a list of any deleted ranges.
3434         #
3435         # The next three parameters give respectively the start, end, and
3436         # value associated with the range.  'value' should be null unless the
3437         # operation is '+';
3438         #
3439         # The range list is kept sorted so that the range with the lowest
3440         # starting position is first in the list, and generally, adjacent
3441         # ranges with the same values are merged into a single larger one (see
3442         # exceptions below).
3443         #
3444         # There are more parameters; all are key => value pairs:
3445         #   Type    gives the type of the value.  It is only valid for '+'.
3446         #           All ranges have types; if this parameter is omitted, 0 is
3447         #           assumed.  Ranges with type 0 are assumed to obey the
3448         #           Unicode rules for casing, etc; ranges with other types are
3449         #           not.  Otherwise, the type is arbitrary, for the caller's
3450         #           convenience, and looked at only by this routine to keep
3451         #           adjacent ranges of different types from being merged into
3452         #           a single larger range, and when Replace =>
3453         #           $IF_NOT_EQUIVALENT is specified (see just below).
3454         #   Replace  determines what to do if the range list already contains
3455         #            ranges which coincide with all or portions of the input
3456         #            range.  It is only valid for '+':
3457         #       => $NO            means that the new value is not to replace
3458         #                         any existing ones, but any empty gaps of the
3459         #                         range list coinciding with the input range
3460         #                         will be filled in with the new value.
3461         #       => $UNCONDITIONALLY  means to replace the existing values with
3462         #                         this one unconditionally.  However, if the
3463         #                         new and old values are identical, the
3464         #                         replacement is skipped to save cycles
3465         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3466         #          (the default)  with this one if they are not equivalent.
3467         #                         Ranges are equivalent if their types are the
3468         #                         same, and they are the same string; or if
3469         #                         both are type 0 ranges, if their Unicode
3470         #                         standard forms are identical.  In this last
3471         #                         case, the routine chooses the more "modern"
3472         #                         one to use.  This is because some of the
3473         #                         older files are formatted with values that
3474         #                         are, for example, ALL CAPs, whereas the
3475         #                         derived files have a more modern style,
3476         #                         which looks better.  By looking for this
3477         #                         style when the pre-existing and replacement
3478         #                         standard forms are the same, we can move to
3479         #                         the modern style
3480         #       => $MULTIPLE_BEFORE means that if this range duplicates an
3481         #                         existing one, but has a different value,
3482         #                         don't replace the existing one, but insert
3483         #                         this, one so that the same range can occur
3484         #                         multiple times.  They are stored LIFO, so
3485         #                         that the final one inserted is the first one
3486         #                         returned in an ordered search of the table.
3487         #                         If this is an exact duplicate, including the
3488         #                         value, the original will be moved to be
3489         #                         first, before any other duplicate ranges
3490         #                         with different values.
3491         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3492         #                         FIFO, so that this one is inserted after all
3493         #                         others that currently exist.  If this is an
3494         #                         exact duplicate, including value, of an
3495         #                         existing range, this one is discarded
3496         #                         (leaving the existing one in its original,
3497         #                         higher priority position
3498         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3499         #
3500         # "same value" means identical for non-type-0 ranges, and it means
3501         # having the same standard forms for type-0 ranges.
3502
3503         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3504
3505         my $self = shift;
3506         my $operation = shift;   # '+' for add/replace; '-' for delete;
3507         my $start = shift;
3508         my $end   = shift;
3509         my $value = shift;
3510
3511         my %args = @_;
3512
3513         $value = "" if not defined $value;        # warning: $value can be "0"
3514
3515         my $replace = delete $args{'Replace'};
3516         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3517
3518         my $type = delete $args{'Type'};
3519         $type = 0 unless defined $type;
3520
3521         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3522
3523         my $addr = do { no overloading; pack 'J', $self; };
3524
3525         if ($operation ne '+' && $operation ne '-') {
3526             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3527             return;
3528         }
3529         unless (defined $start && defined $end) {
3530             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3531             return;
3532         }
3533         unless ($end >= $start) {
3534             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3535             return;
3536         }
3537         if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3538             Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ".  Adding it anyway");
3539         }
3540         #local $to_trace = 1 if main::DEBUG;
3541
3542         if ($operation eq '-') {
3543             if ($replace != $IF_NOT_EQUIVALENT) {
3544                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3545                 $replace = $IF_NOT_EQUIVALENT;
3546             }
3547             if ($type) {
3548                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3549                 $type = 0;
3550             }
3551             if ($value ne "") {
3552                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3553                 $value = "";
3554             }
3555         }
3556
3557         my $r = $ranges{$addr};               # The current list of ranges
3558         my $range_list_size = scalar @$r;     # And its size
3559         my $max = $max{$addr};                # The current high code point in
3560                                               # the list of ranges
3561
3562         # Do a special case requiring fewer machine cycles when the new range
3563         # starts after the current highest point.  The Unicode input data is
3564         # structured so this is common.
3565         if ($start > $max) {
3566
3567             trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3568             return if $operation eq '-'; # Deleting a non-existing range is a
3569                                          # no-op
3570
3571             # If the new range doesn't logically extend the current final one
3572             # in the range list, create a new range at the end of the range
3573             # list.  (max cleverly is initialized to a negative number not
3574             # adjacent to 0 if the range list is empty, so even adding a range
3575             # to an empty range list starting at 0 will have this 'if'
3576             # succeed.)
3577             if ($start > $max + 1        # non-adjacent means can't extend.
3578                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3579                 || @{$r}[-1]->type != $type # types differ, can't extend.
3580             ) {
3581                 push @$r, Range->new($start, $end,
3582                                      Value => $value,
3583                                      Type => $type);
3584             }
3585             else {
3586
3587                 # Here, the new range starts just after the current highest in
3588                 # the range list, and they have the same type and value.
3589                 # Extend the current range to incorporate the new one.
3590                 @{$r}[-1]->set_end($end);
3591             }
3592
3593             # This becomes the new maximum.
3594             $max{$addr} = $end;
3595
3596             return;
3597         }
3598         #local $to_trace = 0 if main::DEBUG;
3599
3600         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3601
3602         # Here, the input range isn't after the whole rest of the range list.
3603         # Most likely 'splice' will be needed.  The rest of the routine finds
3604         # the needed splice parameters, and if necessary, does the splice.
3605         # First, find the offset parameter needed by the splice function for
3606         # the input range.  Note that the input range may span multiple
3607         # existing ones, but we'll worry about that later.  For now, just find
3608         # the beginning.  If the input range is to be inserted starting in a
3609         # position not currently in the range list, it must (obviously) come
3610         # just after the range below it, and just before the range above it.
3611         # Slightly less obviously, it will occupy the position currently
3612         # occupied by the range that is to come after it.  More formally, we
3613         # are looking for the position, $i, in the array of ranges, such that:
3614         #
3615         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3616         #
3617         # (The ordered relationships within existing ranges are also shown in
3618         # the equation above).  However, if the start of the input range is
3619         # within an existing range, the splice offset should point to that
3620         # existing range's position in the list; that is $i satisfies a
3621         # somewhat different equation, namely:
3622         #
3623         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3624         #
3625         # More briefly, $start can come before or after r[$i]->start, and at
3626         # this point, we don't know which it will be.  However, these
3627         # two equations share these constraints:
3628         #
3629         #   r[$i-1]->end < $start <= r[$i]->end
3630         #
3631         # And that is good enough to find $i.
3632
3633         my $i = $self->_search_ranges($start);
3634         if (! defined $i) {
3635             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3636             return;
3637         }
3638
3639         # The search function returns $i such that:
3640         #
3641         # r[$i-1]->end < $start <= r[$i]->end
3642         #
3643         # That means that $i points to the first range in the range list
3644         # that could possibly be affected by this operation.  We still don't
3645         # know if the start of the input range is within r[$i], or if it
3646         # points to empty space between r[$i-1] and r[$i].
3647         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3648
3649         # Special case the insertion of data that is not to replace any
3650         # existing data.
3651         if ($replace == $NO) {  # If $NO, has to be operation '+'
3652             #local $to_trace = 1 if main::DEBUG;
3653             trace "Doesn't replace" if main::DEBUG && $to_trace;
3654
3655             # Here, the new range is to take effect only on those code points
3656             # that aren't already in an existing range.  This can be done by
3657             # looking through the existing range list and finding the gaps in
3658             # the ranges that this new range affects, and then calling this
3659             # function recursively on each of those gaps, leaving untouched
3660             # anything already in the list.  Gather up a list of the changed
3661             # gaps first so that changes to the internal state as new ranges
3662             # are added won't be a problem.
3663             my @gap_list;
3664
3665             # First, if the starting point of the input range is outside an
3666             # existing one, there is a gap from there to the beginning of the
3667             # existing range -- add a span to fill the part that this new
3668             # range occupies
3669             if ($start < $r->[$i]->start) {
3670                 push @gap_list, Range->new($start,
3671                                            main::min($end,
3672                                                      $r->[$i]->start - 1),
3673                                            Type => $type);
3674                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3675             }
3676
3677             # Then look through the range list for other gaps until we reach
3678             # the highest range affected by the input one.
3679             my $j;
3680             for ($j = $i+1; $j < $range_list_size; $j++) {
3681                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3682                 last if $end < $r->[$j]->start;
3683
3684                 # If there is a gap between when this range starts and the
3685                 # previous one ends, add a span to fill it.  Note that just
3686                 # because there are two ranges doesn't mean there is a
3687                 # non-zero gap between them.  It could be that they have
3688                 # different values or types
3689                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3690                     push @gap_list,
3691                         Range->new($r->[$j-1]->end + 1,
3692                                    $r->[$j]->start - 1,
3693                                    Type => $type);
3694                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3695                 }
3696             }
3697
3698             # Here, we have either found an existing range in the range list,
3699             # beyond the area affected by the input one, or we fell off the
3700             # end of the loop because the input range affects the whole rest
3701             # of the range list.  In either case, $j is 1 higher than the
3702             # highest affected range.  If $j == $i, it means that there are no
3703             # affected ranges, that the entire insertion is in the gap between
3704             # r[$i-1], and r[$i], which we already have taken care of before
3705             # the loop.
3706             # On the other hand, if there are affected ranges, it might be
3707             # that there is a gap that needs filling after the final such
3708             # range to the end of the input range
3709             if ($r->[$j-1]->end < $end) {
3710                     push @gap_list, Range->new(main::max($start,
3711                                                          $r->[$j-1]->end + 1),
3712                                                $end,
3713                                                Type => $type);
3714                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3715             }
3716
3717             # Call recursively to fill in all the gaps.
3718             foreach my $gap (@gap_list) {
3719                 $self->_add_delete($operation,
3720                                    $gap->start,
3721                                    $gap->end,
3722                                    $value,
3723                                    Type => $type);
3724             }
3725
3726             return;
3727         }
3728
3729         # Here, we have taken care of the case where $replace is $NO.
3730         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3731         # If inserting a multiple record, this is where it goes, before the
3732         # first (if any) existing one if inserting LIFO.  (If this is to go
3733         # afterwards, FIFO, we below move the pointer to there.)  These imply
3734         # an insertion, and no change to any existing ranges.  Note that $i
3735         # can be -1 if this new range doesn't actually duplicate any existing,
3736         # and comes at the beginning of the list.
3737         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3738
3739             if ($start != $end) {
3740                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3741                 return;
3742             }
3743
3744             # If the new code point is within a current range ...
3745             if ($end >= $r->[$i]->start) {
3746
3747                 # Don't add an exact duplicate, as it isn't really a multiple
3748                 my $existing_value = $r->[$i]->value;
3749                 my $existing_type = $r->[$i]->type;
3750                 return if $value eq $existing_value && $type eq $existing_type;
3751
3752                 # If the multiple value is part of an existing range, we want
3753                 # to split up that range, so that only the single code point
3754                 # is affected.  To do this, we first call ourselves
3755                 # recursively to delete that code point from the table, having
3756                 # preserved its current data above.  Then we call ourselves
3757                 # recursively again to add the new multiple, which we know by
3758                 # the test just above is different than the current code
3759                 # point's value, so it will become a range containing a single
3760                 # code point: just itself.  Finally, we add back in the
3761                 # pre-existing code point, which will again be a single code
3762                 # point range.  Because 'i' likely will have changed as a
3763                 # result of these operations, we can't just continue on, but
3764                 # do this operation recursively as well.  If we are inserting
3765                 # LIFO, the pre-existing code point needs to go after the new
3766                 # one, so use MULTIPLE_AFTER; and vice versa.
3767                 if ($r->[$i]->start != $r->[$i]->end) {
3768                     $self->_add_delete('-', $start, $end, "");
3769                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3770                     return $self->_add_delete('+',
3771                             $start, $end,
3772                             $existing_value,
3773                             Type => $existing_type,
3774                             Replace => ($replace == $MULTIPLE_BEFORE)
3775                                        ? $MULTIPLE_AFTER
3776                                        : $MULTIPLE_BEFORE);
3777                 }
3778             }
3779
3780             # If to place this new record after, move to beyond all existing
3781             # ones; but don't add this one if identical to any of them, as it
3782             # isn't really a multiple.  This leaves the original order, so
3783             # that the current request is ignored.  The reasoning is that the
3784             # previous request that wanted this record to have high priority
3785             # should have precedence.
3786             if ($replace == $MULTIPLE_AFTER) {
3787                 while ($i < @$r && $r->[$i]->start == $start) {
3788                     return if $value eq $r->[$i]->value
3789                               && $type eq $r->[$i]->type;
3790                     $i++;
3791                 }
3792             }
3793             else {
3794                 # If instead we are to place this new record before any
3795                 # existing ones, remove any identical ones that come after it.
3796                 # This changes the existing order so that the new one is
3797                 # first, as is being requested.
3798                 for (my $j = $i + 1;
3799                      $j < @$r && $r->[$j]->start == $start;
3800                      $j++)
3801                 {
3802                     if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3803                         splice @$r, $j, 1;
3804                         last;   # There should only be one instance, so no
3805                                 # need to keep looking
3806                     }
3807                 }
3808             }
3809
3810             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3811             my @return = splice @$r,
3812                                 $i,
3813                                 0,
3814                                 Range->new($start,
3815                                            $end,
3816                                            Value => $value,
3817                                            Type => $type);
3818             if (main::DEBUG && $to_trace) {
3819                 trace "After splice:";
3820                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3821                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3822                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
3823                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3824                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3825                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3826             }
3827             return @return;
3828         }
3829
3830         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
3831         # leaves delete, insert, and replace either unconditionally or if not
3832         # equivalent.  $i still points to the first potential affected range.
3833         # Now find the highest range affected, which will determine the length
3834         # parameter to splice.  (The input range can span multiple existing
3835         # ones.)  If this isn't a deletion, while we are looking through the
3836         # range list, see also if this is a replacement rather than a clean
3837         # insertion; that is if it will change the values of at least one
3838         # existing range.  Start off assuming it is an insert, until find it
3839         # isn't.
3840         my $clean_insert = $operation eq '+';
3841         my $j;        # This will point to the highest affected range
3842
3843         # For non-zero types, the standard form is the value itself;
3844         my $standard_form = ($type) ? $value : main::standardize($value);
3845
3846         for ($j = $i; $j < $range_list_size; $j++) {
3847             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3848
3849             # If find a range that it doesn't overlap into, we can stop
3850             # searching
3851             last if $end < $r->[$j]->start;
3852
3853             # Here, overlaps the range at $j.  If the values don't match,
3854             # and so far we think this is a clean insertion, it becomes a
3855             # non-clean insertion, i.e., a 'change' or 'replace' instead.
3856             if ($clean_insert) {
3857                 if ($r->[$j]->standard_form ne $standard_form) {
3858                     $clean_insert = 0;
3859                     if ($replace == $CROAK) {
3860                         main::croak("The range to add "
3861                         . sprintf("%04X", $start)
3862                         . '-'
3863                         . sprintf("%04X", $end)
3864                         . " with value '$value' overlaps an existing range $r->[$j]");
3865                     }
3866                 }
3867                 else {
3868
3869                     # Here, the two values are essentially the same.  If the
3870                     # two are actually identical, replacing wouldn't change
3871                     # anything so skip it.
3872                     my $pre_existing = $r->[$j]->value;
3873                     if ($pre_existing ne $value) {
3874
3875                         # Here the new and old standardized values are the
3876                         # same, but the non-standardized values aren't.  If
3877                         # replacing unconditionally, then replace
3878                         if( $replace == $UNCONDITIONALLY) {
3879                             $clean_insert = 0;
3880                         }
3881                         else {
3882
3883                             # Here, are replacing conditionally.  Decide to
3884                             # replace or not based on which appears to look
3885                             # the "nicest".  If one is mixed case and the
3886                             # other isn't, choose the mixed case one.
3887                             my $new_mixed = $value =~ /[A-Z]/
3888                                             && $value =~ /[a-z]/;
3889                             my $old_mixed = $pre_existing =~ /[A-Z]/
3890                                             && $pre_existing =~ /[a-z]/;
3891
3892                             if ($old_mixed != $new_mixed) {
3893                                 $clean_insert = 0 if $new_mixed;
3894                                 if (main::DEBUG && $to_trace) {
3895                                     if ($clean_insert) {
3896                                         trace "Retaining $pre_existing over $value";
3897                                     }
3898                                     else {
3899                                         trace "Replacing $pre_existing with $value";
3900                                     }
3901                                 }
3902                             }
3903                             else {
3904
3905                                 # Here casing wasn't different between the two.
3906                                 # If one has hyphens or underscores and the
3907                                 # other doesn't, choose the one with the
3908                                 # punctuation.
3909                                 my $new_punct = $value =~ /[-_]/;
3910                                 my $old_punct = $pre_existing =~ /[-_]/;
3911
3912                                 if ($old_punct != $new_punct) {
3913                                     $clean_insert = 0 if $new_punct;
3914                                     if (main::DEBUG && $to_trace) {
3915                                         if ($clean_insert) {
3916                                             trace "Retaining $pre_existing over $value";
3917                                         }
3918                                         else {
3919                                             trace "Replacing $pre_existing with $value";
3920                                         }
3921                                     }
3922                                 }   # else existing one is just as "good";
3923                                     # retain it to save cycles.
3924                             }
3925                         }
3926                     }
3927                 }
3928             }
3929         } # End of loop looking for highest affected range.
3930
3931         # Here, $j points to one beyond the highest range that this insertion
3932         # affects (hence to beyond the range list if that range is the final
3933         # one in the range list).
3934
3935         # The splice length is all the affected ranges.  Get it before
3936         # subtracting, for efficiency, so we don't have to later add 1.
3937         my $length = $j - $i;
3938
3939         $j--;        # $j now points to the highest affected range.
3940         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3941
3942         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3943         # $j points to the highest affected range.  But it can be < $i or even
3944         # -1.  These happen only if the insertion is entirely in the gap
3945         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3946         # above exited first time through with $end < $r->[$i]->start.  (And
3947         # then we subtracted one from j)  This implies also that $start <
3948         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3949         # $start, so the entire input range is in the gap.
3950         if ($j < $i) {
3951
3952             # Here the entire input range is in the gap before $i.
3953
3954             if (main::DEBUG && $to_trace) {
3955                 if ($i) {
3956                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3957                 }
3958                 else {
3959                     trace "Entire range is before $r->[$i]";
3960                 }
3961             }
3962             return if $operation ne '+'; # Deletion of a non-existent range is
3963                                          # a no-op
3964         }
3965         else {
3966
3967             # Here part of the input range is not in the gap before $i.  Thus,
3968             # there is at least one affected one, and $j points to the highest
3969             # such one.
3970
3971             # At this point, here is the situation:
3972             # This is not an insertion of a multiple, nor of tentative ($NO)
3973             # data.
3974             #   $i  points to the first element in the current range list that
3975             #            may be affected by this operation.  In fact, we know
3976             #            that the range at $i is affected because we are in
3977             #            the else branch of this 'if'
3978             #   $j  points to the highest affected range.
3979             # In other words,
3980             #   r[$i-1]->end < $start <= r[$i]->end
3981             # And:
3982             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3983             #
3984             # Also:
3985             #   $clean_insert is a boolean which is set true if and only if
3986             #        this is a "clean insertion", i.e., not a change nor a
3987             #        deletion (multiple was handled above).
3988
3989             # We now have enough information to decide if this call is a no-op
3990             # or not.  It is a no-op if this is an insertion of already
3991             # existing data.
3992
3993             if (main::DEBUG && $to_trace && $clean_insert
3994                                          && $i == $j
3995                                          && $start >= $r->[$i]->start)
3996             {
3997                     trace "no-op";
3998             }
3999             return if $clean_insert
4000                       && $i == $j # more than one affected range => not no-op
4001
4002                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4003                       # Further, $start and/or $end is >= r[$i]->start
4004                       # The test below hence guarantees that
4005                       #     r[$i]->start < $start <= $end <= r[$i]->end
4006                       # This means the input range is contained entirely in
4007                       # the one at $i, so is a no-op
4008                       && $start >= $r->[$i]->start;
4009         }
4010
4011         # Here, we know that some action will have to be taken.  We have
4012         # calculated the offset and length (though adjustments may be needed)
4013         # for the splice.  Now start constructing the replacement list.
4014         my @replacement;
4015         my $splice_start = $i;
4016
4017         my $extends_below;
4018         my $extends_above;
4019
4020         # See if should extend any adjacent ranges.
4021         if ($operation eq '-') { # Don't extend deletions
4022             $extends_below = $extends_above = 0;
4023         }
4024         else {  # Here, should extend any adjacent ranges.  See if there are
4025                 # any.
4026             $extends_below = ($i > 0
4027                             # can't extend unless adjacent
4028                             && $r->[$i-1]->end == $start -1
4029                             # can't extend unless are same standard value
4030                             && $r->[$i-1]->standard_form eq $standard_form
4031                             # can't extend unless share type
4032                             && $r->[$i-1]->type == $type);
4033             $extends_above = ($j+1 < $range_list_size
4034                             && $r->[$j+1]->start == $end +1
4035                             && $r->[$j+1]->standard_form eq $standard_form
4036                             && $r->[$j+1]->type == $type);
4037         }
4038         if ($extends_below && $extends_above) { # Adds to both
4039             $splice_start--;     # start replace at element below
4040             $length += 2;        # will replace on both sides
4041             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4042
4043             # The result will fill in any gap, replacing both sides, and
4044             # create one large range.
4045             @replacement = Range->new($r->[$i-1]->start,
4046                                       $r->[$j+1]->end,
4047                                       Value => $value,
4048                                       Type => $type);
4049         }
4050         else {
4051
4052             # Here we know that the result won't just be the conglomeration of
4053             # a new range with both its adjacent neighbors.  But it could
4054             # extend one of them.
4055
4056             if ($extends_below) {
4057
4058                 # Here the new element adds to the one below, but not to the
4059                 # one above.  If inserting, and only to that one range,  can
4060                 # just change its ending to include the new one.
4061                 if ($length == 0 && $clean_insert) {
4062                     $r->[$i-1]->set_end($end);
4063                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4064                     return;
4065                 }
4066                 else {
4067                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4068                     $splice_start--;        # start replace at element below
4069                     $length++;              # will replace the element below
4070                     $start = $r->[$i-1]->start;
4071                 }
4072             }
4073             elsif ($extends_above) {
4074
4075                 # Here the new element adds to the one above, but not below.
4076                 # Mirror the code above
4077                 if ($length == 0 && $clean_insert) {
4078                     $r->[$j+1]->set_start($start);
4079                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4080                     return;
4081                 }
4082                 else {
4083                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4084                     $length++;        # will replace the element above
4085                     $end = $r->[$j+1]->end;
4086                 }
4087             }
4088
4089             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4090
4091             # Finally, here we know there will have to be a splice.
4092             # If the change or delete affects only the highest portion of the
4093             # first affected range, the range will have to be split.  The
4094             # splice will remove the whole range, but will replace it by a new
4095             # range containing just the unaffected part.  So, in this case,
4096             # add to the replacement list just this unaffected portion.
4097             if (! $extends_below
4098                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4099             {
4100                 push @replacement,
4101                     Range->new($r->[$i]->start,
4102                                $start - 1,
4103                                Value => $r->[$i]->value,
4104                                Type => $r->[$i]->type);
4105             }
4106
4107             # In the case of an insert or change, but not a delete, we have to
4108             # put in the new stuff;  this comes next.
4109             if ($operation eq '+') {
4110                 push @replacement, Range->new($start,
4111                                               $end,
4112                                               Value => $value,
4113                                               Type => $type);
4114             }
4115
4116             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4117             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4118
4119             # And finally, if we're changing or deleting only a portion of the
4120             # highest affected range, it must be split, as the lowest one was.
4121             if (! $extends_above
4122                 && $j >= 0  # Remember that j can be -1 if before first
4123                             # current element
4124                 && $end >= $r->[$j]->start
4125                 && $end < $r->[$j]->end)
4126             {
4127                 push @replacement,
4128                     Range->new($end + 1,
4129                                $r->[$j]->end,
4130                                Value => $r->[$j]->value,
4131                                Type => $r->[$j]->type);
4132             }
4133         }
4134
4135         # And do the splice, as calculated above
4136         if (main::DEBUG && $to_trace) {
4137             trace "replacing $length element(s) at $i with ";
4138             foreach my $replacement (@replacement) {
4139                 trace "    $replacement";
4140             }
4141             trace "Before splice:";
4142             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4143             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4144             trace "i  =[", $i, "]", $r->[$i];
4145             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4146             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4147         }
4148
4149         my @return = splice @$r, $splice_start, $length, @replacement;
4150
4151         if (main::DEBUG && $to_trace) {
4152             trace "After splice:";
4153             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4154             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4155             trace "i  =[", $i, "]", $r->[$i];
4156             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4157             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4158             trace "removed ", @return if @return;
4159         }
4160
4161         # An actual deletion could have changed the maximum in the list.
4162         # There was no deletion if the splice didn't return something, but
4163         # otherwise recalculate it.  This is done too rarely to worry about
4164         # performance.
4165         if ($operation eq '-' && @return) {
4166             if (@$r) {
4167                 $max{$addr} = $r->[-1]->end;
4168             }
4169             else {  # Now empty
4170                 $max{$addr} = $max_init;
4171             }
4172         }
4173         return @return;
4174     }
4175
4176     sub reset_each_range {  # reset the iterator for each_range();
4177         my $self = shift;
4178         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4179
4180         no overloading;
4181         undef $each_range_iterator{pack 'J', $self};
4182         return;
4183     }
4184
4185     sub each_range {
4186         # Iterate over each range in a range list.  Results are undefined if
4187         # the range list is changed during the iteration.
4188
4189         my $self = shift;
4190         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4191
4192         my $addr = do { no overloading; pack 'J', $self; };
4193
4194         return if $self->is_empty;
4195
4196         $each_range_iterator{$addr} = -1
4197                                 if ! defined $each_range_iterator{$addr};
4198         $each_range_iterator{$addr}++;
4199         return $ranges{$addr}->[$each_range_iterator{$addr}]
4200                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4201         undef $each_range_iterator{$addr};
4202         return;
4203     }
4204
4205     sub count {        # Returns count of code points in range list
4206         my $self = shift;
4207         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4208
4209         my $addr = do { no overloading; pack 'J', $self; };
4210
4211         my $count = 0;
4212         foreach my $range (@{$ranges{$addr}}) {
4213             $count += $range->end - $range->start + 1;
4214         }
4215         return $count;
4216     }
4217
4218     sub delete_range {    # Delete a range
4219         my $self = shift;
4220         my $start = shift;
4221         my $end = shift;
4222
4223         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4224
4225         return $self->_add_delete('-', $start, $end, "");
4226     }
4227
4228     sub is_empty { # Returns boolean as to if a range list is empty
4229         my $self = shift;
4230         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4231
4232         no overloading;
4233         return scalar @{$ranges{pack 'J', $self}} == 0;
4234     }
4235
4236     sub hash {
4237         # Quickly returns a scalar suitable for separating tables into
4238         # buckets, i.e. it is a hash function of the contents of a table, so
4239         # there are relatively few conflicts.
4240
4241         my $self = shift;
4242         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4243
4244         my $addr = do { no overloading; pack 'J', $self; };
4245
4246         # These are quickly computable.  Return looks like 'min..max;count'
4247         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4248     }
4249 } # End closure for _Range_List_Base
4250
4251 package Range_List;
4252 use base '_Range_List_Base';
4253
4254 # A Range_List is a range list for match tables; i.e. the range values are
4255 # not significant.  Thus a number of operations can be safely added to it,
4256 # such as inversion, intersection.  Note that union is also an unsafe
4257 # operation when range values are cared about, and that method is in the base
4258 # class, not here.  But things are set up so that that method is callable only
4259 # during initialization.  Only in this derived class, is there an operation
4260 # that combines two tables.  A Range_Map can thus be used to initialize a
4261 # Range_List, and its mappings will be in the list, but are not significant to
4262 # this class.
4263
4264 sub trace { return main::trace(@_); }
4265
4266 { # Closure
4267
4268     use overload
4269         fallback => 0,
4270         '+' => sub { my $self = shift;
4271                     my $other = shift;
4272
4273                     return $self->_union($other)
4274                 },
4275         '+=' => sub { my $self = shift;
4276                     my $other = shift;
4277                     my $reversed = shift;
4278
4279                     if ($reversed) {
4280                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4281                         . ref($other)
4282                         . ' += '
4283                         . ref($self)
4284                         . "'.  undef returned.");
4285                         return;
4286                     }
4287
4288                     return $self->_union($other)
4289                 },
4290         '&' => sub { my $self = shift;
4291                     my $other = shift;
4292
4293                     return $self->_intersect($other, 0);
4294                 },
4295         '&=' => sub { my $self = shift;
4296                     my $other = shift;
4297                     my $reversed = shift;
4298
4299                     if ($reversed) {
4300                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4301                         . ref($other)
4302                         . ' &= '
4303                         . ref($self)
4304                         . "'.  undef returned.");
4305                         return;
4306                     }
4307
4308                     return $self->_intersect($other, 0);
4309                 },
4310         '~' => "_invert",
4311         '-' => "_subtract",
4312     ;
4313
4314     sub _invert {
4315         # Returns a new Range_List that gives all code points not in $self.
4316
4317         my $self = shift;
4318
4319         my $new = Range_List->new;
4320
4321         # Go through each range in the table, finding the gaps between them
4322         my $max = -1;   # Set so no gap before range beginning at 0
4323         for my $range ($self->ranges) {
4324             my $start = $range->start;
4325             my $end   = $range->end;
4326
4327             # If there is a gap before this range, the inverse will contain
4328             # that gap.
4329             if ($start > $max + 1) {
4330                 $new->add_range($max + 1, $start - 1);
4331             }
4332             $max = $end;
4333         }
4334
4335         # And finally, add the gap from the end of the table to the max
4336         # possible code point
4337         if ($max < $MAX_UNICODE_CODEPOINT) {
4338             $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4339         }
4340         return $new;
4341     }
4342
4343     sub _subtract {
4344         # Returns a new Range_List with the argument deleted from it.  The
4345         # argument can be a single code point, a range, or something that has
4346         # a range, with the _range_list() method on it returning them
4347
4348         my $self = shift;
4349         my $other = shift;
4350         my $reversed = shift;
4351         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4352
4353         if ($reversed) {
4354             Carp::my_carp_bug("Bad news.  Can't cope with '"
4355             . ref($other)
4356             . ' - '
4357             . ref($self)
4358             . "'.  undef returned.");
4359             return;
4360         }
4361
4362         my $new = Range_List->new(Initialize => $self);
4363
4364         if (! ref $other) { # Single code point
4365             $new->delete_range($other, $other);
4366         }
4367         elsif ($other->isa('Range')) {
4368             $new->delete_range($other->start, $other->end);
4369         }
4370         elsif ($other->can('_range_list')) {
4371             foreach my $range ($other->_range_list->ranges) {
4372                 $new->delete_range($range->start, $range->end);
4373             }
4374         }
4375         else {
4376             Carp::my_carp_bug("Can't cope with a "
4377                         . ref($other)
4378                         . " argument to '-'.  Subtraction ignored."
4379                         );
4380             return $self;
4381         }
4382
4383         return $new;
4384     }
4385
4386     sub _intersect {
4387         # Returns either a boolean giving whether the two inputs' range lists
4388         # intersect (overlap), or a new Range_List containing the intersection
4389         # of the two lists.  The optional final parameter being true indicates
4390         # to do the check instead of the intersection.
4391
4392         my $a_object = shift;
4393         my $b_object = shift;
4394         my $check_if_overlapping = shift;
4395         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4396         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4397
4398         if (! defined $b_object) {
4399             my $message = "";
4400             $message .= $a_object->_owner_name_of if defined $a_object;
4401             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4402             return;
4403         }
4404
4405         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4406         # Thus the intersection could be much more simply be written:
4407         #   return ~(~$a_object + ~$b_object);
4408         # But, this is slower, and when taking the inverse of a large
4409         # range_size_1 table, back when such tables were always stored that
4410         # way, it became prohibitively slow, hence the code was changed to the
4411         # below
4412
4413         if ($b_object->isa('Range')) {
4414             $b_object = Range_List->new(Initialize => $b_object,
4415                                         Owner => $a_object->_owner_name_of);
4416         }
4417         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4418
4419         my @a_ranges = $a_object->ranges;
4420         my @b_ranges = $b_object->ranges;
4421
4422         #local $to_trace = 1 if main::DEBUG;
4423         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4424
4425         # Start with the first range in each list
4426         my $a_i = 0;
4427         my $range_a = $a_ranges[$a_i];
4428         my $b_i = 0;
4429         my $range_b = $b_ranges[$b_i];
4430
4431         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4432                                                 if ! $check_if_overlapping;
4433
4434         # If either list is empty, there is no intersection and no overlap
4435         if (! defined $range_a || ! defined $range_b) {
4436             return $check_if_overlapping ? 0 : $new;
4437         }
4438         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4439
4440         # Otherwise, must calculate the intersection/overlap.  Start with the
4441         # very first code point in each list
4442         my $a = $range_a->start;
4443         my $b = $range_b->start;
4444
4445         # Loop through all the ranges of each list; in each iteration, $a and
4446         # $b are the current code points in their respective lists
4447         while (1) {
4448
4449             # If $a and $b are the same code point, ...
4450             if ($a == $b) {
4451
4452                 # it means the lists overlap.  If just checking for overlap
4453                 # know the answer now,
4454                 return 1 if $check_if_overlapping;
4455
4456                 # The intersection includes this code point plus anything else
4457                 # common to both current ranges.
4458                 my $start = $a;
4459                 my $end = main::min($range_a->end, $range_b->end);
4460                 if (! $check_if_overlapping) {
4461                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4462                     $new->add_range($start, $end);
4463                 }
4464
4465                 # Skip ahead to the end of the current intersect
4466                 $a = $b = $end;
4467
4468                 # If the current intersect ends at the end of either range (as
4469                 # it must for at least one of them), the next possible one
4470                 # will be the beginning code point in it's list's next range.
4471                 if ($a == $range_a->end) {
4472                     $range_a = $a_ranges[++$a_i];
4473                     last unless defined $range_a;
4474                     $a = $range_a->start;
4475                 }
4476                 if ($b == $range_b->end) {
4477                     $range_b = $b_ranges[++$b_i];
4478                     last unless defined $range_b;
4479                     $b = $range_b->start;
4480                 }
4481
4482                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4483             }
4484             elsif ($a < $b) {
4485
4486                 # Not equal, but if the range containing $a encompasses $b,
4487                 # change $a to be the middle of the range where it does equal
4488                 # $b, so the next iteration will get the intersection
4489                 if ($range_a->end >= $b) {
4490                     $a = $b;
4491                 }
4492                 else {
4493
4494                     # Here, the current range containing $a is entirely below
4495                     # $b.  Go try to find a range that could contain $b.
4496                     $a_i = $a_object->_search_ranges($b);
4497
4498                     # If no range found, quit.
4499                     last unless defined $a_i;
4500
4501                     # The search returns $a_i, such that
4502                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4503                     # Set $a to the beginning of this new range, and repeat.
4504                     $range_a = $a_ranges[$a_i];
4505                     $a = $range_a->start;
4506                 }
4507             }
4508             else { # Here, $b < $a.
4509
4510                 # Mirror image code to the leg just above
4511                 if ($range_b->end >= $a) {
4512                     $b = $a;
4513                 }
4514                 else {
4515                     $b_i = $b_object->_search_ranges($a);
4516                     last unless defined $b_i;
4517                     $range_b = $b_ranges[$b_i];
4518                     $b = $range_b->start;
4519                 }
4520             }
4521         } # End of looping through ranges.
4522
4523         # Intersection fully computed, or now know that there is no overlap
4524         return $check_if_overlapping ? 0 : $new;
4525     }
4526
4527     sub overlaps {
4528         # Returns boolean giving whether the two arguments overlap somewhere
4529
4530         my $self = shift;
4531         my $other = shift;
4532         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4533
4534         return $self->_intersect($other, 1);
4535     }
4536
4537     sub add_range {
4538         # Add a range to the list.
4539
4540         my $self = shift;
4541         my $start = shift;
4542         my $end = shift;
4543         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4544
4545         return $self->_add_delete('+', $start, $end, "");
4546     }
4547
4548     sub matches_identically_to {
4549         # Return a boolean as to whether or not two Range_Lists match identical
4550         # sets of code points.
4551
4552         my $self = shift;
4553         my $other = shift;
4554         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4555
4556         # These are ordered in increasing real time to figure out (at least
4557         # until a patch changes that and doesn't change this)
4558         return 0 if $self->max != $other->max;
4559         return 0 if $self->min != $other->min;
4560         return 0 if $self->range_count != $other->range_count;
4561         return 0 if $self->count != $other->count;
4562
4563         # Here they could be identical because all the tests above passed.
4564         # The loop below is somewhat simpler since we know they have the same
4565         # number of elements.  Compare range by range, until reach the end or
4566         # find something that differs.
4567         my @a_ranges = $self->ranges;
4568         my @b_ranges = $other->ranges;
4569         for my $i (0 .. @a_ranges - 1) {
4570             my $a = $a_ranges[$i];
4571             my $b = $b_ranges[$i];
4572             trace "self $a; other $b" if main::DEBUG && $to_trace;
4573             return 0 if ! defined $b
4574                         || $a->start != $b->start
4575                         || $a->end != $b->end;
4576         }
4577         return 1;
4578     }
4579
4580     sub is_code_point_usable {
4581         # This used only for making the test script.  See if the input
4582         # proposed trial code point is one that Perl will handle.  If second
4583         # parameter is 0, it won't select some code points for various
4584         # reasons, noted below.
4585
4586         my $code = shift;
4587         my $try_hard = shift;
4588         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4589
4590         return 0 if $code < 0;                # Never use a negative
4591
4592         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4593         # the character very frequently used.
4594         return $try_hard if $code == 0x0000;
4595
4596         # shun non-character code points.
4597         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4598         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4599
4600         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
4601         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4602
4603         return 1;
4604     }
4605
4606     sub get_valid_code_point {
4607         # Return a code point that's part of the range list.  Returns nothing
4608         # if the table is empty or we can't find a suitable code point.  This
4609         # used only for making the test script.
4610
4611         my $self = shift;
4612         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4613
4614         my $addr = do { no overloading; pack 'J', $self; };
4615
4616         # On first pass, don't choose less desirable code points; if no good
4617         # one is found, repeat, allowing a less desirable one to be selected.
4618         for my $try_hard (0, 1) {
4619
4620             # Look through all the ranges for a usable code point.
4621             for my $set (reverse $self->ranges) {
4622
4623                 # Try the edge cases first, starting with the end point of the
4624                 # range.
4625                 my $end = $set->end;
4626                 return $end if is_code_point_usable($end, $try_hard);
4627
4628                 # End point didn't, work.  Start at the beginning and try
4629                 # every one until find one that does work.
4630                 for my $trial ($set->start .. $end - 1) {
4631                     return $trial if is_code_point_usable($trial, $try_hard);
4632                 }
4633             }
4634         }
4635         return ();  # If none found, give up.
4636     }
4637
4638     sub get_invalid_code_point {
4639         # Return a code point that's not part of the table.  Returns nothing
4640         # if the table covers all code points or a suitable code point can't
4641         # be found.  This used only for making the test script.
4642
4643         my $self = shift;
4644         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4645
4646         # Just find a valid code point of the inverse, if any.
4647         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4648     }
4649 } # end closure for Range_List
4650
4651 package Range_Map;
4652 use base '_Range_List_Base';
4653
4654 # A Range_Map is a range list in which the range values (called maps) are
4655 # significant, and hence shouldn't be manipulated by our other code, which
4656 # could be ambiguous or lose things.  For example, in taking the union of two
4657 # lists, which share code points, but which have differing values, which one
4658 # has precedence in the union?
4659 # It turns out that these operations aren't really necessary for map tables,
4660 # and so this class was created to make sure they aren't accidentally
4661 # applied to them.
4662
4663 { # Closure
4664
4665     sub add_map {
4666         # Add a range containing a mapping value to the list
4667
4668         my $self = shift;
4669         # Rest of parameters passed on
4670
4671         return $self->_add_delete('+', @_);
4672     }
4673
4674     sub add_duplicate {
4675         # Adds entry to a range list which can duplicate an existing entry
4676
4677         my $self = shift;
4678         my $code_point = shift;
4679         my $value = shift;
4680         my %args = @_;
4681         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4682         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4683
4684         return $self->add_map($code_point, $code_point,
4685                                 $value, Replace => $replace);
4686     }
4687 } # End of closure for package Range_Map
4688
4689 package _Base_Table;
4690
4691 # A table is the basic data structure that gets written out into a file for
4692 # use by the Perl core.  This is the abstract base class implementing the
4693 # common elements from the derived ones.  A list of the methods to be
4694 # furnished by an implementing class is just after the constructor.
4695
4696 sub standardize { return main::standardize($_[0]); }
4697 sub trace { return main::trace(@_); }
4698
4699 { # Closure
4700
4701     main::setup_package();
4702
4703     my %range_list;
4704     # Object containing the ranges of the table.
4705     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4706
4707     my %full_name;
4708     # The full table name.
4709     main::set_access('full_name', \%full_name, 'r');
4710
4711     my %name;
4712     # The table name, almost always shorter
4713     main::set_access('name', \%name, 'r');
4714
4715     my %short_name;
4716     # The shortest of all the aliases for this table, with underscores removed
4717     main::set_access('short_name', \%short_name);
4718
4719     my %nominal_short_name_length;
4720     # The length of short_name before removing underscores
4721     main::set_access('nominal_short_name_length',
4722                     \%nominal_short_name_length);
4723
4724     my %complete_name;
4725     # The complete name, including property.
4726     main::set_access('complete_name', \%complete_name, 'r');
4727
4728     my %property;
4729     # Parent property this table is attached to.
4730     main::set_access('property', \%property, 'r');
4731
4732     my %aliases;
4733     # Ordered list of alias objects of the table's name.  The first ones in
4734     # the list are output first in comments
4735     main::set_access('aliases', \%aliases, 'readable_array');
4736
4737     my %comment;
4738     # A comment associated with the table for human readers of the files
4739     main::set_access('comment', \%comment, 's');
4740
4741     my %description;
4742     # A comment giving a short description of the table's meaning for human
4743     # readers of the files.
4744     main::set_access('description', \%description, 'readable_array');
4745
4746     my %note;
4747     # A comment giving a short note about the table for human readers of the
4748     # files.
4749     main::set_access('note', \%note, 'readable_array');
4750
4751     my %fate;
4752     # Enum; there are a number of possibilities for what happens to this
4753     # table: it could be normal, or suppressed, or not for external use.  See
4754     # values at definition for $SUPPRESSED.
4755     main::set_access('fate', \%fate, 'r');
4756
4757     my %find_table_from_alias;
4758     # The parent property passes this pointer to a hash which this class adds
4759     # all its aliases to, so that the parent can quickly take an alias and
4760     # find this table.
4761     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4762
4763     my %locked;
4764     # After this table is made equivalent to another one; we shouldn't go
4765     # changing the contents because that could mean it's no longer equivalent
4766     main::set_access('locked', \%locked, 'r');
4767
4768     my %file_path;
4769     # This gives the final path to the file containing the table.  Each
4770     # directory in the path is an element in the array
4771     main::set_access('file_path', \%file_path, 'readable_array');
4772
4773     my %status;
4774     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4775     main::set_access('status', \%status, 'r');
4776
4777     my %status_info;
4778     # A comment about its being obsolete, or whatever non normal status it has
4779     main::set_access('status_info', \%status_info, 'r');
4780
4781     my %caseless_equivalent;
4782     # The table this is equivalent to under /i matching, if any.
4783     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4784
4785     my %range_size_1;
4786     # Is the table to be output with each range only a single code point?
4787     # This is done to avoid breaking existing code that may have come to rely
4788     # on this behavior in previous versions of this program.)
4789     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4790
4791     my %perl_extension;
4792     # A boolean set iff this table is a Perl extension to the Unicode
4793     # standard.
4794     main::set_access('perl_extension', \%perl_extension, 'r');
4795
4796     my %output_range_counts;
4797     # A boolean set iff this table is to have comments written in the
4798     # output file that contain the number of code points in the range.
4799     # The constructor can override the global flag of the same name.
4800     main::set_access('output_range_counts', \%output_range_counts, 'r');
4801
4802     my %format;
4803     # The format of the entries of the table.  This is calculated from the
4804     # data in the table (or passed in the constructor).  This is an enum e.g.,
4805     # $STRING_FORMAT.  It is marked protected as it should not be generally
4806     # used to override calculations.
4807     main::set_access('format', \%format, 'r', 'p_s');
4808
4809     sub new {
4810         # All arguments are key => value pairs, which you can see below, most
4811         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
4812         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4813         # documented in the Alias package
4814
4815         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4816
4817         my $class = shift;
4818
4819         my $self = bless \do { my $anonymous_scalar }, $class;
4820         my $addr = do { no overloading; pack 'J', $self; };
4821
4822         my %args = @_;
4823
4824         $name{$addr} = delete $args{'Name'};
4825         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4826         $full_name{$addr} = delete $args{'Full_Name'};
4827         my $complete_name = $complete_name{$addr}
4828                           = delete $args{'Complete_Name'};
4829         $format{$addr} = delete $args{'Format'};
4830         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4831         $property{$addr} = delete $args{'_Property'};
4832         $range_list{$addr} = delete $args{'_Range_List'};
4833         $status{$addr} = delete $args{'Status'} || $NORMAL;
4834         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4835         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4836         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4837         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4838         my $ucd = delete $args{'UCD'};
4839
4840         my $description = delete $args{'Description'};
4841         my $ok_as_filename = delete $args{'OK_as_Filename'};
4842         my $loose_match = delete $args{'Fuzzy'};
4843         my $note = delete $args{'Note'};
4844         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4845         my $perl_extension = delete $args{'Perl_Extension'};
4846
4847         # Shouldn't have any left over
4848         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4849
4850         # Can't use || above because conceivably the name could be 0, and
4851         # can't use // operator in case this program gets used in Perl 5.8
4852         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4853         $output_range_counts{$addr} = $output_range_counts if
4854                                         ! defined $output_range_counts{$addr};
4855
4856         $aliases{$addr} = [ ];
4857         $comment{$addr} = [ ];
4858         $description{$addr} = [ ];
4859         $note{$addr} = [ ];
4860         $file_path{$addr} = [ ];
4861         $locked{$addr} = "";
4862
4863         push @{$description{$addr}}, $description if $description;
4864         push @{$note{$addr}}, $note if $note;
4865
4866         if ($fate{$addr} == $PLACEHOLDER) {
4867
4868             # A placeholder table doesn't get documented, is a perl extension,
4869             # and quite likely will be empty
4870             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4871             $perl_extension = 1 if ! defined $perl_extension;
4872             $ucd = 0 if ! defined $ucd;
4873             push @tables_that_may_be_empty, $complete_name{$addr};
4874             $self->add_comment(<<END);
4875 This is a placeholder because it is not in Version $string_version of Unicode,
4876 but is needed by the Perl core to work gracefully.  Because it is not in this
4877 version of Unicode, it will not be listed in $pod_file.pod
4878 END
4879         }
4880         elsif (exists $why_suppressed{$complete_name}
4881                 # Don't suppress if overridden
4882                 && ! grep { $_ eq $complete_name{$addr} }
4883                                                     @output_mapped_properties)
4884         {
4885             $fate{$addr} = $SUPPRESSED;
4886         }
4887         elsif ($fate{$addr} == $SUPPRESSED
4888                && ! exists $why_suppressed{$property{$addr}->complete_name})
4889         {
4890             Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4891             # perhaps Fate => [ $SUPPRESSED, "reason" ]
4892         }
4893
4894         # If hasn't set its status already, see if it is on one of the
4895         # lists of properties or tables that have particular statuses; if
4896         # not, is normal.  The lists are prioritized so the most serious
4897         # ones are checked first
4898         if (! $status{$addr}) {
4899             if (exists $why_deprecated{$complete_name}) {
4900                 $status{$addr} = $DEPRECATED;
4901             }
4902             elsif (exists $why_stabilized{$complete_name}) {
4903                 $status{$addr} = $STABILIZED;
4904             }
4905             elsif (exists $why_obsolete{$complete_name}) {
4906                 $status{$addr} = $OBSOLETE;
4907             }
4908
4909             # Existence above doesn't necessarily mean there is a message
4910             # associated with it.  Use the most serious message.
4911             if ($status{$addr}) {
4912                 if ($why_deprecated{$complete_name}) {
4913                     $status_info{$addr}
4914                                 = $why_deprecated{$complete_name};
4915                 }
4916                 elsif ($why_stabilized{$complete_name}) {
4917                     $status_info{$addr}
4918                                 = $why_stabilized{$complete_name};
4919                 }
4920                 elsif ($why_obsolete{$complete_name}) {
4921                     $status_info{$addr}
4922                                 = $why_obsolete{$complete_name};
4923                 }
4924             }
4925         }
4926
4927         $perl_extension{$addr} = $perl_extension || 0;
4928
4929         # Don't list a property by default that is internal only
4930         if ($fate{$addr} > $MAP_PROXIED) {
4931             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4932             $ucd = 0 if ! defined $ucd;
4933         }
4934         else {
4935             $ucd = 1 if ! defined $ucd;
4936         }
4937
4938         # By convention what typically gets printed only or first is what's
4939         # first in the list, so put the full name there for good output
4940         # clarity.  Other routines rely on the full name being first on the
4941         # list
4942         $self->add_alias($full_name{$addr},
4943                             OK_as_Filename => $ok_as_filename,
4944                             Fuzzy => $loose_match,
4945                             Re_Pod_Entry => $make_re_pod_entry,
4946                             Status => $status{$addr},
4947                             UCD => $ucd,
4948                             );
4949
4950         # Then comes the other name, if meaningfully different.
4951         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4952             $self->add_alias($name{$addr},
4953                             OK_as_Filename => $ok_as_filename,
4954                             Fuzzy => $loose_match,
4955                             Re_Pod_Entry => $make_re_pod_entry,
4956                             Status => $status{$addr},
4957                             UCD => $ucd,
4958                             );
4959         }
4960
4961         return $self;
4962     }
4963
4964     # Here are the methods that are required to be defined by any derived
4965     # class
4966     for my $sub (qw(
4967                     handle_special_range
4968                     append_to_body
4969                     pre_body
4970                 ))
4971                 # write() knows how to write out normal ranges, but it calls
4972                 # handle_special_range() when it encounters a non-normal one.
4973                 # append_to_body() is called by it after it has handled all
4974                 # ranges to add anything after the main portion of the table.
4975                 # And finally, pre_body() is called after all this to build up
4976                 # anything that should appear before the main portion of the
4977                 # table.  Doing it this way allows things in the middle to
4978                 # affect what should appear before the main portion of the
4979                 # table.
4980     {
4981         no strict "refs";
4982         *$sub = sub {
4983             Carp::my_carp_bug( __LINE__
4984                               . ": Must create method '$sub()' for "
4985                               . ref shift);
4986             return;
4987         }
4988     }
4989
4990     use overload
4991         fallback => 0,
4992         "." => \&main::_operator_dot,
4993         ".=" => \&main::_operator_dot_equal,
4994         '!=' => \&main::_operator_not_equal,
4995         '==' => \&main::_operator_equal,
4996     ;
4997
4998     sub ranges {
4999         # Returns the array of ranges associated with this table.
5000
5001         no overloading;
5002         return $range_list{pack 'J', shift}->ranges;
5003     }
5004
5005     sub add_alias {
5006         # Add a synonym for this table.
5007
5008         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5009
5010         my $self = shift;
5011         my $name = shift;       # The name to add.
5012         my $pointer = shift;    # What the alias hash should point to.  For
5013                                 # map tables, this is the parent property;
5014                                 # for match tables, it is the table itself.
5015
5016         my %args = @_;
5017         my $loose_match = delete $args{'Fuzzy'};
5018
5019         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5020         $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5021
5022         my $ok_as_filename = delete $args{'OK_as_Filename'};
5023         $ok_as_filename = 1 unless defined $ok_as_filename;
5024
5025         my $status = delete $args{'Status'};
5026         $status = $NORMAL unless defined $status;
5027
5028         # An internal name does not get documented, unless overridden by the
5029         # input.
5030         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5031
5032         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5033
5034         # Capitalize the first letter of the alias unless it is one of the CJK
5035         # ones which specifically begins with a lower 'k'.  Do this because
5036         # Unicode has varied whether they capitalize first letters or not, and
5037         # have later changed their minds and capitalized them, but not the
5038         # other way around.  So do it always and avoid changes from release to
5039         # release
5040         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5041
5042         my $addr = do { no overloading; pack 'J', $self; };
5043
5044         # Figure out if should be loosely matched if not already specified.
5045         if (! defined $loose_match) {
5046
5047             # Is a loose_match if isn't null, and doesn't begin with an
5048             # underscore and isn't just a number
5049             if ($name ne ""
5050                 && substr($name, 0, 1) ne '_'
5051                 && $name !~ qr{^[0-9_.+-/]+$})
5052             {
5053                 $loose_match = 1;
5054             }
5055             else {
5056                 $loose_match = 0;
5057             }
5058         }
5059
5060         # If this alias has already been defined, do nothing.
5061         return if defined $find_table_from_alias{$addr}->{$name};
5062
5063         # That includes if it is standardly equivalent to an existing alias,
5064         # in which case, add this name to the list, so won't have to search
5065         # for it again.
5066         my $standard_name = main::standardize($name);
5067         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5068             $find_table_from_alias{$addr}->{$name}
5069                         = $find_table_from_alias{$addr}->{$standard_name};
5070             return;
5071         }
5072
5073         # Set the index hash for this alias for future quick reference.
5074         $find_table_from_alias{$addr}->{$name} = $pointer;
5075         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5076         local $to_trace = 0 if main::DEBUG;
5077         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5078         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5079
5080
5081         # Put the new alias at the end of the list of aliases unless the final
5082         # element begins with an underscore (meaning it is for internal perl
5083         # use) or is all numeric, in which case, put the new one before that
5084         # one.  This floats any all-numeric or underscore-beginning aliases to
5085         # the end.  This is done so that they are listed last in output lists,
5086         # to encourage the user to use a better name (either more descriptive
5087         # or not an internal-only one) instead.  This ordering is relied on
5088         # implicitly elsewhere in this program, like in short_name()
5089         my $list = $aliases{$addr};
5090         my $insert_position = (@$list == 0
5091                                 || (substr($list->[-1]->name, 0, 1) ne '_'
5092                                     && $list->[-1]->name =~ /\D/))
5093                             ? @$list
5094                             : @$list - 1;
5095         splice @$list,
5096                 $insert_position,
5097                 0,
5098                 Alias->new($name, $loose_match, $make_re_pod_entry,
5099                                                 $ok_as_filename, $status, $ucd);
5100
5101         # This name may be shorter than any existing ones, so clear the cache
5102         # of the shortest, so will have to be recalculated.
5103         no overloading;
5104         undef $short_name{pack 'J', $self};
5105         return;
5106     }
5107
5108     sub short_name {
5109         # Returns a name suitable for use as the base part of a file name.
5110         # That is, shorter wins.  It can return undef if there is no suitable
5111         # name.  The name has all non-essential underscores removed.
5112
5113         # The optional second parameter is a reference to a scalar in which
5114         # this routine will store the length the returned name had before the
5115         # underscores were removed, or undef if the return is undef.
5116
5117         # The shortest name can change if new aliases are added.  So using
5118         # this should be deferred until after all these are added.  The code
5119         # that does that should clear this one's cache.
5120         # Any name with alphabetics is preferred over an all numeric one, even
5121         # if longer.
5122
5123         my $self = shift;
5124         my $nominal_length_ptr = shift;
5125         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5126
5127         my $addr = do { no overloading; pack 'J', $self; };
5128
5129         # For efficiency, don't recalculate, but this means that adding new
5130         # aliases could change what the shortest is, so the code that does
5131         # that needs to undef this.
5132         if (defined $short_name{$addr}) {
5133             if ($nominal_length_ptr) {
5134                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5135             }
5136             return $short_name{$addr};
5137         }
5138
5139         # Look at each alias
5140         foreach my $alias ($self->aliases()) {
5141
5142             # Don't use an alias that isn't ok to use for an external name.
5143             next if ! $alias->ok_as_filename;
5144
5145             my $name = main::Standardize($alias->name);
5146             trace $self, $name if main::DEBUG && $to_trace;
5147
5148             # Take the first one, or a shorter one that isn't numeric.  This
5149             # relies on numeric aliases always being last in the array
5150             # returned by aliases().  Any alpha one will have precedence.
5151             if (! defined $short_name{$addr}
5152                 || ($name =~ /\D/
5153                     && length($name) < length($short_name{$addr})))
5154             {
5155                 # Remove interior underscores.
5156                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5157
5158                 $nominal_short_name_length{$addr} = length $name;
5159             }
5160         }
5161
5162         # If the short name isn't a nice one, perhaps an equivalent table has
5163         # a better one.
5164         if (! defined $short_name{$addr}
5165             || $short_name{$addr} eq ""
5166             || $short_name{$addr} eq "_")
5167         {
5168             my $return;
5169             foreach my $follower ($self->children) {    # All equivalents
5170                 my $follower_name = $follower->short_name;
5171                 next unless defined $follower_name;
5172
5173                 # Anything (except undefined) is better than underscore or
5174                 # empty
5175                 if (! defined $return || $return eq "_") {
5176                     $return = $follower_name;
5177                     next;
5178                 }
5179
5180                 # If the new follower name isn't "_" and is shorter than the
5181                 # current best one, prefer the new one.
5182                 next if $follower_name eq "_";
5183                 next if length $follower_name > length $return;
5184                 $return = $follower_name;
5185             }
5186             $short_name{$addr} = $return if defined $return;
5187         }
5188
5189         # If no suitable external name return undef
5190         if (! defined $short_name{$addr}) {
5191             $$nominal_length_ptr = undef if $nominal_length_ptr;
5192             return;
5193         }
5194
5195         # Don't allow a null short name.
5196         if ($short_name{$addr} eq "") {
5197             $short_name{$addr} = '_';
5198             $nominal_short_name_length{$addr} = 1;
5199         }
5200
5201         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5202
5203         if ($nominal_length_ptr) {
5204             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5205         }
5206         return $short_name{$addr};
5207     }
5208
5209     sub external_name {
5210         # Returns the external name that this table should be known by.  This
5211         # is usually the short_name, but not if the short_name is undefined,
5212         # in which case the external_name is arbitrarily set to the
5213         # underscore.
5214
5215         my $self = shift;
5216         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5217
5218         my $short = $self->short_name;
5219         return $short if defined $short;
5220
5221         return '_';
5222     }
5223
5224     sub add_description { # Adds the parameter as a short description.
5225
5226         my $self = shift;
5227         my $description = shift;
5228         chomp $description;
5229         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5230
5231         no overloading;
5232         push @{$description{pack 'J', $self}}, $description;
5233
5234         return;
5235     }
5236
5237     sub add_note { # Adds the parameter as a short note.
5238
5239         my $self = shift;
5240         my $note = shift;
5241         chomp $note;
5242         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5243
5244         no overloading;
5245         push @{$note{pack 'J', $self}}, $note;
5246
5247         return;
5248     }
5249
5250     sub add_comment { # Adds the parameter as a comment.
5251
5252         return unless $debugging_build;
5253
5254         my $self = shift;
5255         my $comment = shift;
5256         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5257
5258         chomp $comment;
5259
5260         no overloading;
5261         push @{$comment{pack 'J', $self}}, $comment;
5262
5263         return;
5264     }
5265
5266     sub comment {
5267         # Return the current comment for this table.  If called in list
5268         # context, returns the array of comments.  In scalar, returns a string
5269         # of each element joined together with a period ending each.
5270
5271         my $self = shift;
5272         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5273
5274         my $addr = do { no overloading; pack 'J', $self; };
5275         my @list = @{$comment{$addr}};
5276         return @list if wantarray;
5277         my $return = "";
5278         foreach my $sentence (@list) {
5279             $return .= '.  ' if $return;
5280             $return .= $sentence;
5281             $return =~ s/\.$//;
5282         }
5283         $return .= '.' if $return;
5284         return $return;
5285     }
5286
5287     sub initialize {
5288         # Initialize the table with the argument which is any valid
5289         # initialization for range lists.
5290
5291         my $self = shift;
5292         my $addr = do { no overloading; pack 'J', $self; };
5293         my $initialization = shift;
5294         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5295
5296         # Replace the current range list with a new one of the same exact
5297         # type.
5298         my $class = ref $range_list{$addr};
5299         $range_list{$addr} = $class->new(Owner => $self,
5300                                         Initialize => $initialization);
5301         return;
5302
5303     }
5304
5305     sub header {
5306         # The header that is output for the table in the file it is written
5307         # in.
5308
5309         my $self = shift;
5310         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5311
5312         my $return = "";
5313         $return .= $DEVELOPMENT_ONLY if $compare_versions;
5314         $return .= $HEADER;
5315         return $return;
5316     }
5317
5318     sub write {
5319         # Write a representation of the table to its file.  It calls several
5320         # functions furnished by sub-classes of this abstract base class to
5321         # handle non-normal ranges, to add stuff before the table, and at its
5322         # end.  If the table is to be written so that adjustments are
5323         # required, this does that conversion.
5324
5325         my $self = shift;
5326         my $use_adjustments = shift; # ? output in adjusted format or not
5327         my $tab_stops = shift;       # The number of tab stops over to put any
5328                                      # comment.
5329         my $suppress_value = shift;  # Optional, if the value associated with
5330                                      # a range equals this one, don't write
5331                                      # the range
5332         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5333
5334         my $addr = do { no overloading; pack 'J', $self; };
5335
5336         # Start with the header
5337         my @HEADER = $self->header;
5338
5339         # Then the comments
5340         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5341                                                         if $comment{$addr};
5342
5343         # Things discovered processing the main body of the document may
5344         # affect what gets output before it, therefore pre_body() isn't called
5345         # until after all other processing of the table is done.
5346
5347         # The main body looks like a 'here' document.  If annotating, get rid
5348         # of the comments before passing to the caller, as some callers, such
5349         # as charnames.pm, can't cope with them.  (Outputting range counts
5350         # also introduces comments, but these don't show up in the tables that
5351         # can't cope with comments, and there aren't that many of them that
5352         # it's worth the extra real time to get rid of them).
5353         my @OUT;
5354         if ($annotate) {
5355             # Use the line below in Perls that don't have /r
5356             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5357             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5358         } else {
5359             push @OUT, "return <<'END';\n";
5360         }
5361
5362         if ($range_list{$addr}->is_empty) {
5363
5364             # This is a kludge for empty tables to silence a warning in
5365             # utf8.c, which can't really deal with empty tables, but it can
5366             # deal with a table that matches nothing, as the inverse of 'Any'
5367             # does.
5368             push @OUT, "!utf8::Any\n";
5369         }
5370         elsif ($self->name eq 'N'
5371
5372                # To save disk space and table cache space, avoid putting out
5373                # binary N tables, but instead create a file which just inverts
5374                # the Y table.  Since the file will still exist and occupy a
5375                # certain number of blocks, might as well output the whole
5376                # thing if it all will fit in one block.   The number of
5377                # ranges below is an approximate number for that.
5378                && ($self->property->type == $BINARY
5379                    || $self->property->type == $FORCED_BINARY)
5380                # && $self->property->tables == 2  Can't do this because the
5381                #        non-binary properties, like NFDQC aren't specifiable
5382                #        by the notation
5383                && $range_list{$addr}->ranges > 15
5384                && ! $annotate)  # Under --annotate, want to see everything
5385         {
5386             push @OUT, "!utf8::" . $self->property->name . "\n";
5387         }
5388         else {
5389             my $range_size_1 = $range_size_1{$addr};
5390             my $format;            # Used only in $annotate option
5391             my $include_name;      # Used only in $annotate option
5392
5393             if ($annotate) {
5394
5395                 # If annotating each code point, must print 1 per line.
5396                 # The variable could point to a subroutine, and we don't want
5397                 # to lose that fact, so only set if not set already
5398                 $range_size_1 = 1 if ! $range_size_1;
5399
5400                 $format = $self->format;
5401
5402                 # The name of the character is output only for tables that
5403                 # don't already include the name in the output.
5404                 my $property = $self->property;
5405                 $include_name =
5406                     !  ($property == $perl_charname
5407                         || $property == main::property_ref('Unicode_1_Name')
5408                         || $property == main::property_ref('Name')
5409                         || $property == main::property_ref('Name_Alias')
5410                        );
5411             }
5412
5413             # Values for previous time through the loop.  Initialize to
5414             # something that won't be adjacent to the first iteration;
5415             # only $previous_end matters for that.
5416             my $previous_start;
5417             my $previous_end = -2;
5418             my $previous_value;
5419
5420             # Values for next time through the portion of the loop that splits
5421             # the range.  0 in $next_start means there is no remaining portion
5422             # to deal with.
5423             my $next_start = 0;
5424             my $next_end;
5425             my $next_value;
5426             my $offset = 0;
5427
5428             # Output each range as part of the here document.
5429             RANGE:
5430             for my $set ($range_list{$addr}->ranges) {
5431                 if ($set->type != 0) {
5432                     $self->handle_special_range($set);
5433                     next RANGE;
5434                 }
5435                 my $start = $set->start;
5436                 my $end   = $set->end;
5437                 my $value  = $set->value;
5438
5439                 # Don't output ranges whose value is the one to suppress
5440                 next RANGE if defined $suppress_value
5441                               && $value eq $suppress_value;
5442
5443                 {   # This bare block encloses the scope where we may need to
5444                     # split a range (when outputting adjusteds), and each time
5445                     # through we handle the next portion of the original by
5446                     # ending the block with a 'redo'.   The values to use for
5447                     # that next time through are set up just below in the
5448                     # scalars whose names begin with '$next_'.
5449
5450                     if ($use_adjustments) {
5451
5452                         # When converting to use adjustments, we can handle
5453                         # only single element ranges.  Set up so that this
5454                         # time through the loop, we look at the first element,
5455                         # and the next time through, we start off with the
5456                         # remainder.  Thus each time through we look at the
5457                         # first element of the range
5458                         if ($end != $start) {
5459                             $next_start = $start + 1;
5460                             $next_end = $end;
5461                             $next_value = $value;
5462                             $end = $start;
5463                         }
5464
5465                         # The values for some of these tables are stored as
5466                         # hex strings.  Convert those to decimal
5467                         $value = hex($value)
5468                                     if $self->default_map eq $CODE_POINT
5469                                         && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5470
5471                         # If this range is adjacent to the previous one, and
5472                         # the values in each are integers that are also
5473                         # adjacent (differ by 1), then this range really
5474                         # extends the previous one that is already in element
5475                         # $OUT[-1].  So we pop that element, and pretend that
5476                         # the range starts with whatever it started with.
5477                         # $offset is incremented by 1 each time so that it
5478                         # gives the current offset from the first element in
5479                         # the accumulating range, and we keep in $value the
5480                         # value of that first element.
5481                         if ($start == $previous_end + 1
5482                             && $value =~ /^ -? \d+ $/xa
5483                             && $previous_value =~ /^ -? \d+ $/xa
5484                             && ($value == ($previous_value + ++$offset)))
5485                         {
5486                             pop @OUT;
5487                             $start = $previous_start;
5488                             $value = $previous_value;
5489                         }
5490                         else {
5491                             $offset = 0;
5492                         }
5493
5494                         # Save the current values for the next time through
5495                         # the loop.
5496                         $previous_start = $start;
5497                         $previous_end = $end;
5498                         $previous_value = $value;
5499                     }
5500
5501                     # If there is a range and doesn't need a single point range
5502                     # output
5503                     if ($start != $end && ! $range_size_1) {
5504                         push @OUT, sprintf "%04X\t%04X", $start, $end;
5505                         $OUT[-1] .= "\t$value" if $value ne "";
5506
5507                         # Add a comment with the size of the range, if
5508                         # requested.  Expand Tabs to make sure they all start
5509                         # in the same column, and then unexpand to use mostly
5510                         # tabs.
5511                         if (! $output_range_counts{$addr}) {
5512                             $OUT[-1] .= "\n";
5513                         }
5514                         else {
5515                             $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5516                             my $count = main::clarify_number($end - $start + 1);
5517                             use integer;
5518
5519                             my $width = $tab_stops * 8 - 1;
5520                             $OUT[-1] = sprintf("%-*s # [%s]\n",
5521                                                 $width,
5522                                                 $OUT[-1],
5523                                                 $count);
5524                             $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5525                         }
5526                     }
5527
5528                         # Here to output a single code point per line.
5529                         # If not to annotate, use the simple formats
5530                     elsif (! $annotate) {
5531
5532                         # Use any passed in subroutine to output.
5533                         if (ref $range_size_1 eq 'CODE') {
5534                             for my $i ($start .. $end) {
5535                                 push @OUT, &{$range_size_1}($i, $value);
5536                             }
5537                         }
5538                         else {
5539
5540                             # Here, caller is ok with default output.
5541                             for (my $i = $start; $i <= $end; $i++) {
5542                                 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5543                             }
5544                         }
5545                     }
5546                     else {
5547
5548                         # Here, wants annotation.
5549                         for (my $i = $start; $i <= $end; $i++) {
5550
5551                             # Get character information if don't have it already
5552                             main::populate_char_info($i)
5553                                                 if ! defined $viacode[$i];
5554                             my $type = $annotate_char_type[$i];
5555
5556                             # Figure out if should output the next code points
5557                             # as part of a range or not.  If this is not in an
5558                             # annotation range, then won't output as a range,
5559                             # so returns $i.  Otherwise use the end of the
5560                             # annotation range, but no further than the
5561                             # maximum possible end point of the loop.
5562                             my $range_end = main::min(
5563                                         $annotate_ranges->value_of($i) || $i,
5564                                         $end);
5565
5566                             # Use a range if it is a range, and either is one
5567                             # of the special annotation ranges, or the range
5568                             # is at most 3 long.  This last case causes the
5569                             # algorithmically named code points to be output
5570                             # individually in spans of at most 3, as they are
5571                             # the ones whose $type is > 0.
5572                             if ($range_end != $i
5573                                 && ( $type < 0 || $range_end - $i > 2))
5574                             {
5575                                 # Here is to output a range.  We don't allow a
5576                                 # caller-specified output format--just use the
5577                                 # standard one.
5578                                 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5579                                                                 $range_end,
5580                                                                 $value;
5581                                 my $range_name = $viacode[$i];
5582
5583                                 # For the code points which end in their hex
5584                                 # value, we eliminate that from the output
5585                                 # annotation, and capitalize only the first
5586                                 # letter of each word.
5587                                 if ($type == $CP_IN_NAME) {
5588                                     my $hex = sprintf "%04X", $i;
5589                                     $range_name =~ s/-$hex$//;
5590                                     my @words = split " ", $range_name;
5591                                     for my $word (@words) {
5592                                         $word =
5593                                           ucfirst(lc($word)) if $word ne 'CJK';
5594                                     }
5595                                     $range_name = join " ", @words;
5596                                 }
5597                                 elsif ($type == $HANGUL_SYLLABLE) {
5598                                     $range_name = "Hangul Syllable";
5599                                 }
5600
5601                                 $OUT[-1] .= " $range_name" if $range_name;
5602
5603                                 # Include the number of code points in the
5604                                 # range
5605                                 my $count =
5606                                     main::clarify_number($range_end - $i + 1);
5607                                 $OUT[-1] .= " [$count]\n";
5608
5609                                 # Skip to the end of the range
5610                                 $i = $range_end;
5611                             }
5612                             else { # Not in a range.
5613                                 my $comment = "";
5614
5615                                 # When outputting the names of each character,
5616                                 # use the character itself if printable
5617                                 $comment .= "'" . chr($i) . "' "
5618                                                             if $printable[$i];
5619
5620                                 # To make it more readable, use a minimum
5621                                 # indentation
5622                                 my $comment_indent;
5623
5624                                 # Determine the annotation
5625                                 if ($format eq $DECOMP_STRING_FORMAT) {
5626
5627                                     # This is very specialized, with the type
5628                                     # of decomposition beginning the line
5629                                     # enclosed in <...>, and the code points
5630                                     # that the code point decomposes to
5631                                     # separated by blanks.  Create two
5632                                     # strings, one of the printable
5633                                     # characters, and one of their official
5634                                     # names.
5635                                     (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5636                                     my $tostr = "";
5637                                     my $to_name = "";
5638                                     my $to_chr = "";
5639                                     foreach my $to (split " ", $map) {
5640                                         $to = CORE::hex $to;
5641                                         $to_name .= " + " if $to_name;
5642                                         $to_chr .= chr($to);
5643                                         main::populate_char_info($to)
5644                                                     if ! defined $viacode[$to];
5645                                         $to_name .=  $viacode[$to];
5646                                     }
5647
5648                                     $comment .=
5649                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5650                                     $comment_indent = 25;   # Determined by
5651                                                             # experiment
5652                                 }
5653                                 else {
5654
5655                                     # Assume that any table that has hex
5656                                     # format is a mapping of one code point to
5657                                     # another.
5658                                     if ($format eq $HEX_FORMAT) {
5659                                         my $decimal_value = CORE::hex $value;
5660                                         main::populate_char_info($decimal_value)
5661                                         if ! defined $viacode[$decimal_value];
5662                                         $comment .= "=> '"
5663                                         . chr($decimal_value)
5664                                         . "'; " if $printable[$decimal_value];
5665                                     }
5666                                     $comment .= $viacode[$i] if $include_name
5667                                                             && $viacode[$i];
5668                                     if ($format eq $HEX_FORMAT) {
5669                                         my $decimal_value = CORE::hex $value;
5670                                         $comment .=
5671                                             " => $viacode[$decimal_value]"
5672                                                 if $viacode[$decimal_value];
5673                                     }
5674
5675                                     # If including the name, no need to
5676                                     # indent, as the name will already be way
5677                                     # across the line.
5678                                     $comment_indent = ($include_name) ? 0 : 60;
5679                                 }
5680
5681                                 # Use any passed in routine to output the base
5682                                 # part of the line.
5683                                 if (ref $range_size_1 eq 'CODE') {
5684                                     my $base_part=&{$range_size_1}($i, $value);
5685                                     chomp $base_part;
5686                                     push @OUT, $base_part;
5687                                 }
5688                                 else {
5689                                     push @OUT, sprintf "%04X\t\t%s", $i, $value;
5690                                 }
5691
5692                                 # And add the annotation.
5693                                 $OUT[-1] = sprintf "%-*s\t# %s",
5694                                                    $comment_indent,
5695                                                    $OUT[-1],
5696                                                    $comment
5697                                             if $comment;
5698                                 $OUT[-1] .= "\n";
5699                             }
5700                         }
5701                     }
5702
5703                     # If we split the range, set up so the next time through
5704                     # we get the remainder, and redo.
5705                     if ($next_start) {
5706                         $start = $next_start;
5707                         $end = $next_end;
5708                         $value = $next_value;
5709                         $next_start = 0;
5710                         redo;
5711                     }
5712                 }
5713             } # End of loop through all the table's ranges
5714         }
5715
5716         # Add anything that goes after the main body, but within the here
5717         # document,
5718         my $append_to_body = $self->append_to_body;
5719         push @OUT, $append_to_body if $append_to_body;
5720
5721         # And finish the here document.
5722         push @OUT, "END\n";
5723
5724         # Done with the main portion of the body.  Can now figure out what
5725         # should appear before it in the file.
5726         my $pre_body = $self->pre_body;
5727         push @HEADER, $pre_body, "\n" if $pre_body;
5728
5729         # All these files should have a .pl suffix added to them.
5730         my @file_with_pl = @{$file_path{$addr}};
5731         $file_with_pl[-1] .= '.pl';
5732
5733         main::write(\@file_with_pl,
5734                     $annotate,      # utf8 iff annotating
5735                     \@HEADER,
5736                     \@OUT);
5737         return;
5738     }
5739
5740     sub set_status {    # Set the table's status
5741         my $self = shift;
5742         my $status = shift; # The status enum value
5743         my $info = shift;   # Any message associated with it.
5744         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5745
5746         my $addr = do { no overloading; pack 'J', $self; };
5747
5748         $status{$addr} = $status;
5749         $status_info{$addr} = $info;
5750         return;
5751     }
5752
5753     sub set_fate {  # Set the fate of a table
5754         my $self = shift;
5755         my $fate = shift;
5756         my $reason = shift;
5757         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5758
5759         my $addr = do { no overloading; pack 'J', $self; };
5760
5761         return if $fate{$addr} == $fate;    # If no-op
5762
5763         # Can only change the ordinary fate, except if going to $MAP_PROXIED
5764         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5765
5766         $fate{$addr} = $fate;
5767
5768         # Don't document anything to do with a non-normal fated table
5769         if ($fate != $ORDINARY) {
5770             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5771             foreach my $alias ($self->aliases) {
5772                 $alias->set_ucd($put_in_pod);
5773
5774                 # MAP_PROXIED doesn't affect the match tables
5775                 next if $fate == $MAP_PROXIED;
5776                 $alias->set_make_re_pod_entry($put_in_pod);
5777             }
5778         }
5779
5780         # Save the reason for suppression for output
5781         if ($fate == $SUPPRESSED && defined $reason) {
5782             $why_suppressed{$complete_name{$addr}} = $reason;
5783         }
5784
5785         return;
5786     }
5787
5788     sub lock {
5789         # Don't allow changes to the table from now on.  This stores a stack
5790         # trace of where it was called, so that later attempts to modify it
5791         # can immediately show where it got locked.
5792
5793         my $self = shift;
5794         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5795
5796         my $addr = do { no overloading; pack 'J', $self; };
5797
5798         $locked{$addr} = "";
5799
5800         my $line = (caller(0))[2];
5801         my $i = 1;
5802
5803         # Accumulate the stack trace
5804         while (1) {
5805             my ($pkg, $file, $caller_line, $caller) = caller $i++;
5806
5807             last unless defined $caller;
5808
5809             $locked{$addr} .= "    called from $caller() at line $line\n";
5810             $line = $caller_line;
5811         }
5812         $locked{$addr} .= "    called from main at line $line\n";
5813
5814         return;
5815     }
5816
5817     sub carp_if_locked {
5818         # Return whether a table is locked or not, and, by the way, complain
5819         # if is locked
5820
5821         my $self = shift;
5822         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5823
5824         my $addr = do { no overloading; pack 'J', $self; };
5825
5826         return 0 if ! $locked{$addr};
5827         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5828         return 1;
5829     }
5830
5831     sub set_file_path { # Set the final directory path for this table
5832         my $self = shift;
5833         # Rest of parameters passed on
5834
5835         no overloading;
5836         @{$file_path{pack 'J', $self}} = @_;
5837         return
5838     }
5839
5840     # Accessors for the range list stored in this table.  First for
5841     # unconditional
5842     for my $sub (qw(
5843                     containing_range
5844                     contains
5845                     count
5846                     each_range
5847                     hash
5848                     is_empty
5849                     matches_identically_to
5850                     max
5851                     min
5852                     range_count
5853                     reset_each_range
5854                     type_of
5855                     value_of
5856                 ))
5857     {
5858         no strict "refs";
5859         *$sub = sub {
5860             use strict "refs";
5861             my $self = shift;
5862             return $self->_range_list->$sub(@_);
5863         }
5864     }
5865
5866     # Then for ones that should fail if locked
5867     for my $sub (qw(
5868                     delete_range
5869                 ))
5870     {
5871         no strict "refs";
5872         *$sub = sub {
5873             use strict "refs";
5874             my $self = shift;
5875
5876             return if $self->carp_if_locked;
5877             no overloading;
5878             return $self->_range_list->$sub(@_);
5879         }
5880     }
5881
5882 } # End closure
5883
5884 package Map_Table;
5885 use base '_Base_Table';
5886
5887 # A Map Table is a table that contains the mappings from code points to
5888 # values.  There are two weird cases:
5889 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5890 #    are written in the table's file at the end of the table nonetheless.  It
5891 #    requires specially constructed code to handle these; utf8.c can not read
5892 #    these in, so they should not go in $map_directory.  As of this writing,
5893 #    the only case that these happen is for named sequences used in
5894 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
5895 #    something else could come along that uses it.
5896 # 2) Specials are anything that doesn't fit syntactically into the body of the
5897 #    table.  The ranges for these have a map type of non-zero.  The code below
5898 #    knows about and handles each possible type.   In most cases, these are
5899 #    written as part of the header.
5900 #
5901 # A map table deliberately can't be manipulated at will unlike match tables.
5902 # This is because of the ambiguities having to do with what to do with
5903 # overlapping code points.  And there just isn't a need for those things;
5904 # what one wants to do is just query, add, replace, or delete mappings, plus
5905 # write the final result.
5906 # However, there is a method to get the list of possible ranges that aren't in
5907 # this table to use for defaulting missing code point mappings.  And,
5908 # map_add_or_replace_non_nulls() does allow one to add another table to this
5909 # one, but it is clearly very specialized, and defined that the other's
5910 # non-null values replace this one's if there is any overlap.
5911
5912 sub trace { return main::trace(@_); }
5913
5914 { # Closure
5915
5916     main::setup_package();
5917
5918     my %default_map;
5919     # Many input files omit some entries; this gives what the mapping for the
5920     # missing entries should be
5921     main::set_access('default_map', \%default_map, 'r');
5922
5923     my %anomalous_entries;
5924     # Things that go in the body of the table which don't fit the normal
5925     # scheme of things, like having a range.  Not much can be done with these
5926     # once there except to output them.  This was created to handle named
5927     # sequences.
5928     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5929     main::set_access('anomalous_entries',       # Append singular, read plural
5930                     \%anomalous_entries,
5931                     'readable_array');
5932
5933     my %to_output_map;
5934     # Enum as to whether or not to write out this map table, and how:
5935     #   0               don't output
5936     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
5937     #                   it should not be removed nor its format changed.  This
5938     #                   is done for those files that have traditionally been
5939     #                   output.
5940     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
5941     #                   with this file
5942     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
5943     #                   outputting the actual mappings as-is, we adjust things
5944     #                   to create a much more compact table. Only those few
5945     #                   tables where the mapping is convertible at least to an
5946     #                   integer and compacting makes a big difference should
5947     #                   have this.  Hence, the default is to not do this
5948     #                   unless the table's default mapping is to $CODE_POINT,
5949     #                   and the range size is not 1.
5950     main::set_access('to_output_map', \%to_output_map, 's');
5951
5952     sub new {
5953         my $class = shift;
5954         my $name = shift;
5955
5956         my %args = @_;
5957
5958         # Optional initialization data for the table.
5959         my $initialize = delete $args{'Initialize'};
5960
5961         my $default_map = delete $args{'Default_Map'};
5962         my $property = delete $args{'_Property'};
5963         my $full_name = delete $args{'Full_Name'};
5964         my $to_output_map = delete $args{'To_Output_Map'};
5965
5966         # Rest of parameters passed on
5967
5968         my $range_list = Range_Map->new(Owner => $property);
5969
5970         my $self = $class->SUPER::new(
5971                                     Name => $name,
5972                                     Complete_Name =>  $full_name,
5973                                     Full_Name => $full_name,
5974                                     _Property => $property,
5975                                     _Range_List => $range_list,
5976                                     %args);
5977
5978         my $addr = do { no overloading; pack 'J', $self; };
5979
5980         $anomalous_entries{$addr} = [];
5981         $default_map{$addr} = $default_map;
5982         $to_output_map{$addr} = $to_output_map;
5983
5984         $self->initialize($initialize) if defined $initialize;
5985
5986         return $self;
5987     }
5988
5989     use overload
5990         fallback => 0,
5991         qw("") => "_operator_stringify",
5992     ;
5993
5994     sub _operator_stringify {
5995         my $self = shift;
5996
5997         my $name = $self->property->full_name;
5998         $name = '""' if $name eq "";
5999         return "Map table for Property '$name'";
6000     }
6001
6002     sub add_alias {
6003         # Add a synonym for this table (which means the property itself)
6004         my $self = shift;
6005         my $name = shift;
6006         # Rest of parameters passed on.
6007
6008         $self->SUPER::add_alias($name, $self->property, @_);
6009         return;
6010     }
6011
6012     sub add_map {
6013         # Add a range of code points to the list of specially-handled code
6014         # points.  $MULTI_CP is assumed if the type of special is not passed
6015         # in.
6016
6017         my $self = shift;
6018         my $lower = shift;
6019         my $upper = shift;
6020         my $string = shift;
6021         my %args = @_;
6022
6023         my $type = delete $args{'Type'} || 0;
6024         # Rest of parameters passed on
6025
6026         # Can't change the table if locked.
6027         return if $self->carp_if_locked;
6028
6029         my $addr = do { no overloading; pack 'J', $self; };
6030
6031         $self->_range_list->add_map($lower, $upper,
6032                                     $string,
6033                                     @_,
6034                                     Type => $type);
6035         return;
6036     }
6037
6038     sub append_to_body {
6039         # Adds to the written HERE document of the table's body any anomalous
6040         # entries in the table..
6041
6042         my $self = shift;
6043         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6044
6045         my $addr = do { no overloading; pack 'J', $self; };
6046
6047         return "" unless @{$anomalous_entries{$addr}};
6048         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6049     }
6050
6051     sub map_add_or_replace_non_nulls {
6052         # This adds the mappings in the table $other to $self.  Non-null
6053         # mappings from $other override those in $self.  It essentially merges
6054         # the two tables, with the second having priority except for null
6055         # mappings.
6056
6057         my $self = shift;
6058         my $other = shift;
6059         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6060
6061         return if $self->carp_if_locked;
6062
6063         if (! $other->isa(__PACKAGE__)) {
6064             Carp::my_carp_bug("$other should be a "
6065                         . __PACKAGE__
6066                         . ".  Not a '"
6067                         . ref($other)
6068                         . "'.  Not added;");
6069             return;
6070         }
6071
6072         my $addr = do { no overloading; pack 'J', $self; };
6073         my $other_addr = do { no overloading; pack 'J', $other; };
6074
6075         local $to_trace = 0 if main::DEBUG;
6076
6077         my $self_range_list = $self->_range_list;
6078         my $other_range_list = $other->_range_list;
6079         foreach my $range ($other_range_list->ranges) {
6080             my $value = $range->value;
6081             next if $value eq "";
6082             $self_range_list->_add_delete('+',
6083                                           $range->start,
6084                                           $range->end,
6085                                           $value,
6086                                           Type => $range->type,
6087                                           Replace => $UNCONDITIONALLY);
6088         }
6089
6090         return;
6091     }
6092
6093     sub set_default_map {
6094         # Define what code points that are missing from the input files should
6095         # map to
6096
6097         my $self = shift;
6098         my $map = shift;
6099         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6100
6101         my $addr = do { no overloading; pack 'J', $self; };
6102
6103         # Convert the input to the standard equivalent, if any (won't have any
6104         # for $STRING properties)
6105         my $standard = $self->_find_table_from_alias->{$map};
6106         $map = $standard->name if defined $standard;
6107
6108         # Warn if there already is a non-equivalent default map for this
6109         # property.  Note that a default map can be a ref, which means that
6110         # what it actually means is delayed until later in the program, and it
6111         # IS permissible to override it here without a message.
6112         my $default_map = $default_map{$addr};
6113         if (defined $default_map
6114             && ! ref($default_map)
6115             && $default_map ne $map
6116             && main::Standardize($map) ne $default_map)
6117         {
6118             my $property = $self->property;
6119             my $map_table = $property->table($map);
6120             my $default_table = $property->table($default_map);
6121             if (defined $map_table
6122                 && defined $default_table
6123                 && $map_table != $default_table)
6124             {
6125                 Carp::my_carp("Changing the default mapping for "
6126                             . $property
6127                             . " from $default_map to $map'");
6128             }
6129         }
6130
6131         $default_map{$addr} = $map;
6132
6133         # Don't also create any missing table for this map at this point,
6134         # because if we did, it could get done before the main table add is
6135         # done for PropValueAliases.txt; instead the caller will have to make
6136         # sure it exists, if desired.
6137         return;
6138     }
6139
6140     sub to_output_map {
6141         # Returns boolean: should we write this map table?
6142
6143         my $self = shift;
6144         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6145
6146         my $addr = do { no overloading; pack 'J', $self; };
6147
6148         # If overridden, use that
6149         return $to_output_map{$addr} if defined $to_output_map{$addr};
6150
6151         my $full_name = $self->full_name;
6152         return $global_to_output_map{$full_name}
6153                                 if defined $global_to_output_map{$full_name};
6154
6155         # If table says to output, do so; if says to suppress it, do so.
6156         my $fate = $self->fate;
6157         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6158         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6159         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6160
6161         my $type = $self->property->type;
6162
6163         # Don't want to output binary map tables even for debugging.
6164         return 0 if $type == $BINARY;
6165
6166         # But do want to output string ones.  All the ones that remain to
6167         # be dealt with (i.e. which haven't explicitly been set to external)
6168         # are for internal Perl use only.  The default for those that map to
6169         # $CODE_POINT and haven't been restricted to a single element range
6170         # is to use the adjusted form.
6171         if ($type == $STRING) {
6172             return $INTERNAL_MAP if $self->range_size_1
6173                                     || $default_map{$addr} ne $CODE_POINT;
6174             return $OUTPUT_ADJUSTED;
6175         }
6176
6177         # Otherwise is an $ENUM, do output it, for Perl's purposes
6178         return $INTERNAL_MAP;
6179     }
6180
6181     sub inverse_list {
6182         # Returns a Range_List that is gaps of the current table.  That is,
6183         # the inversion
6184
6185         my $self = shift;
6186         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6187
6188         my $current = Range_List->new(Initialize => $self->_range_list,
6189                                 Owner => $self->property);
6190         return ~ $current;
6191     }
6192
6193     sub header {
6194         my $self = shift;
6195         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6196
6197         my $return = $self->SUPER::header();
6198
6199         if ($self->to_output_map >= $INTERNAL_MAP) {
6200             $return .= $INTERNAL_ONLY_HEADER;
6201         }
6202         else {
6203             my $property_name = $self->property->full_name =~ s/Legacy_//r;
6204             $return .= <<END;
6205
6206 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6207
6208 # This file is for internal use by core Perl only.  It is retained for
6209 # backwards compatibility with applications that may have come to rely on it,
6210 # but its format and even its name or existence are subject to change without
6211 # notice in a future Perl version.  Don't use it directly.  Instead, its
6212 # contents are now retrievable through a stable API in the Unicode::UCD
6213 # module: Unicode::UCD::prop_invmap('$property_name').
6214 END
6215         }
6216         return $return;
6217     }
6218
6219     sub set_final_comment {
6220         # Just before output, create the comment that heads the file
6221         # containing this table.
6222
6223         return unless $debugging_build;
6224
6225         my $self = shift;
6226         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6227
6228         # No sense generating a comment if aren't going to write it out.
6229         return if ! $self->to_output_map;
6230
6231         my $addr = do { no overloading; pack 'J', $self; };
6232
6233         my $property = $self->property;
6234
6235         # Get all the possible names for this property.  Don't use any that
6236         # aren't ok for use in a file name, etc.  This is perhaps causing that
6237         # flag to do double duty, and may have to be changed in the future to
6238         # have our own flag for just this purpose; but it works now to exclude
6239         # Perl generated synonyms from the lists for properties, where the
6240         # name is always the proper Unicode one.
6241         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6242
6243         my $count = $self->count;
6244         my $default_map = $default_map{$addr};
6245
6246         # The ranges that map to the default aren't output, so subtract that
6247         # to get those actually output.  A property with matching tables
6248         # already has the information calculated.
6249         if ($property->type != $STRING) {
6250             $count -= $property->table($default_map)->count;
6251         }
6252         elsif (defined $default_map) {
6253
6254             # But for $STRING properties, must calculate now.  Subtract the
6255             # count from each range that maps to the default.
6256             foreach my $range ($self->_range_list->ranges) {
6257                 if ($range->value eq $default_map) {
6258                     $count -= $range->end +1 - $range->start;
6259                 }
6260             }
6261
6262         }
6263
6264         # Get a  string version of $count with underscores in large numbers,
6265         # for clarity.
6266         my $string_count = main::clarify_number($count);
6267
6268         my $code_points = ($count == 1)
6269                         ? 'single code point'
6270                         : "$string_count code points";
6271
6272         my $mapping;
6273         my $these_mappings;
6274         my $are;
6275         if (@property_aliases <= 1) {
6276             $mapping = 'mapping';
6277             $these_mappings = 'this mapping';
6278             $are = 'is'
6279         }
6280         else {
6281             $mapping = 'synonymous mappings';
6282             $these_mappings = 'these mappings';
6283             $are = 'are'
6284         }
6285         my $cp;
6286         if ($count >= $MAX_UNICODE_CODEPOINTS) {
6287             $cp = "any code point in Unicode Version $string_version";
6288         }
6289         else {
6290             my $map_to;
6291             if ($default_map eq "") {
6292                 $map_to = 'the null string';
6293             }
6294             elsif ($default_map eq $CODE_POINT) {
6295                 $map_to = "itself";
6296             }
6297             else {
6298                 $map_to = "'$default_map'";
6299             }
6300             if ($count == 1) {
6301                 $cp = "the single code point";
6302             }
6303             else {
6304                 $cp = "one of the $code_points";
6305             }
6306             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6307         }
6308
6309         my $comment = "";
6310
6311         my $status = $self->status;
6312         if ($status && $status ne $PLACEHOLDER) {
6313             my $warn = uc $status_past_participles{$status};
6314             $comment .= <<END;
6315
6316 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
6317  All property or property=value combinations contained in this file are $warn.
6318  See $unicode_reference_url for what this means.
6319
6320 END
6321         }
6322         $comment .= "This file returns the $mapping:\n";
6323
6324         my $ucd_accessible_name = "";
6325         my $full_name = $self->property->full_name;
6326         for my $i (0 .. @property_aliases - 1) {
6327             my $name = $property_aliases[$i]->name;
6328             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6329             if ($property_aliases[$i]->ucd) {
6330                 if ($name eq $full_name) {
6331                     $ucd_accessible_name = $full_name;
6332                 }
6333                 elsif (! $ucd_accessible_name) {
6334                     $ucd_accessible_name = $name;
6335                 }
6336             }
6337         }
6338         $comment .= "\nwhere 'cp' is $cp.";
6339         if ($ucd_accessible_name) {
6340             $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6341         }
6342
6343         # And append any commentary already set from the actual property.
6344         $comment .= "\n\n" . $self->comment if $self->comment;
6345         if ($self->description) {
6346             $comment .= "\n\n" . join " ", $self->description;
6347         }
6348         if ($self->note) {
6349             $comment .= "\n\n" . join " ", $self->note;
6350         }
6351         $comment .= "\n";
6352
6353         if (! $self->perl_extension) {
6354             $comment .= <<END;
6355
6356 For information about what this property really means, see:
6357 $unicode_reference_url
6358 END
6359         }
6360
6361         if ($count) {        # Format differs for empty table
6362                 $comment.= "\nThe format of the ";
6363             if ($self->range_size_1) {
6364                 $comment.= <<END;
6365 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6366 is in hex; MAPPING is what CODE_POINT maps to.
6367 END
6368             }
6369             else {
6370
6371                 # There are tables which end up only having one element per
6372                 # range, but it is not worth keeping track of for making just
6373                 # this comment a little better.
6374                 $comment.= <<END;
6375 non-comment portions of the main body of lines of this file is:
6376 START\\tSTOP\\tMAPPING where START is the starting code point of the
6377 range, in hex; STOP is the ending point, or if omitted, the range has just one
6378 code point; MAPPING is what each code point between START and STOP maps to.
6379 END
6380                 if ($self->output_range_counts) {
6381                     $comment .= <<END;
6382 Numbers in comments in [brackets] indicate how many code points are in the
6383 range (omitted when the range is a single code point or if the mapping is to
6384 the null string).
6385 END
6386                 }
6387             }
6388         }
6389         $self->set_comment(main::join_lines($comment));
6390         return;
6391     }
6392
6393     my %swash_keys; # Makes sure don't duplicate swash names.
6394
6395     # The remaining variables are temporaries used while writing each table,
6396     # to output special ranges.
6397     my @multi_code_point_maps;  # Map is to more than one code point.
6398
6399     sub handle_special_range {
6400         # Called in the middle of write when it finds a range it doesn't know
6401         # how to handle.
6402
6403         my $self = shift;
6404         my $range = shift;
6405         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6406
6407         my $addr = do { no overloading; pack 'J', $self; };
6408
6409         my $type = $range->type;
6410
6411         my $low = $range->start;
6412         my $high = $range->end;
6413         my $map = $range->value;
6414
6415         # No need to output the range if it maps to the default.
6416         return if $map eq $default_map{$addr};
6417
6418         my $property = $self->property;
6419
6420         # Switch based on the map type...
6421         if ($type == $HANGUL_SYLLABLE) {
6422
6423             # These are entirely algorithmically determinable based on
6424             # some constants furnished by Unicode; for now, just set a
6425             # flag to indicate that have them.  After everything is figured
6426             # out, we will output the code that does the algorithm.  (Don't
6427             # output them if not needed because we are suppressing this
6428             # property.)
6429             $has_hangul_syllables = 1 if $property->to_output_map;
6430         }
6431         elsif ($type == $CP_IN_NAME) {
6432
6433             # Code points whose name ends in their code point are also
6434             # algorithmically determinable, but need information about the map
6435             # to do so.  Both the map and its inverse are stored in data
6436             # structures output in the file.  They are stored in the mean time
6437             # in global lists The lists will be written out later into Name.pm,
6438             # which is created only if needed.  In order to prevent duplicates
6439             # in the list, only add to them for one property, should multiple
6440             # ones need them.
6441             if ($needing_code_points_ending_in_code_point == 0) {
6442                 $needing_code_points_ending_in_code_point = $property;
6443             }
6444             if ($property == $needing_code_points_ending_in_code_point) {
6445                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6446                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6447
6448                 my $squeezed = $map =~ s/[-\s]+//gr;
6449                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6450                                                                           $low;
6451                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6452                                                                          $high;
6453
6454                 push @code_points_ending_in_code_point, { low => $low,
6455                                                         high => $high,
6456                                                         name => $map
6457                                                         };
6458             }
6459         }
6460         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6461
6462             # Multi-code point maps and null string maps have an entry
6463             # for each code point in the range.  They use the same
6464             # output format.
6465             for my $code_point ($low .. $high) {
6466
6467                 # The pack() below can't cope with surrogates.  XXX This may
6468                 # no longer be true
6469                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6470                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
6471                     next;
6472                 }
6473
6474                 # Generate the hash entries for these in the form that
6475                 # utf8.c understands.
6476                 my $tostr = "";
6477                 my $to_name = "";
6478                 my $to_chr = "";
6479                 foreach my $to (split " ", $map) {
6480                     if ($to !~ /^$code_point_re$/) {
6481                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
6482                         next;
6483                     }
6484                     $tostr .= sprintf "\\x{%s}", $to;
6485                     $to = CORE::hex $to;
6486                     if ($annotate) {
6487                         $to_name .= " + " if $to_name;
6488                         $to_chr .= chr($to);
6489                         main::populate_char_info($to)
6490                                             if ! defined $viacode[$to];
6491                         $to_name .=  $viacode[$to];
6492                     }
6493                 }
6494
6495                 # I (khw) have never waded through this line to
6496                 # understand it well enough to comment it.
6497                 my $utf8 = sprintf(qq["%s" => "$tostr",],
6498                         join("", map { sprintf "\\x%02X", $_ }
6499                             unpack("U0C*", pack("U", $code_point))));
6500
6501                 # Add a comment so that a human reader can more easily
6502                 # see what's going on.
6503                 push @multi_code_point_maps,
6504                         sprintf("%-45s # U+%04X", $utf8, $code_point);
6505                 if (! $annotate) {
6506                     $multi_code_point_maps[-1] .= " => $map";
6507                 }
6508                 else {
6509                     main::populate_char_info($code_point)
6510                                     if ! defined $viacode[$code_point];
6511                     $multi_code_point_maps[-1] .= " '"
6512                         . chr($code_point)
6513                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6514                 }
6515             }
6516         }
6517         else {
6518             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
6519         }
6520
6521         return;
6522     }
6523
6524     sub pre_body {
6525         # Returns the string that should be output in the file before the main
6526         # body of this table.  It isn't called until the main body is
6527         # calculated, saving a pass.  The string includes some hash entries
6528         # identifying the format of the body, and what the single value should
6529         # be for all ranges missing from it.  It also includes any code points
6530         # which have map_types that don't go in the main table.
6531
6532         my $self = shift;
6533         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6534
6535         my $addr = do { no overloading; pack 'J', $self; };
6536
6537         my $name = $self->property->swash_name;
6538
6539         # Currently there is nothing in the pre_body unless a swash is being
6540         # generated.
6541         return unless defined $name;
6542
6543         if (defined $swash_keys{$name}) {
6544             Carp::my_carp(main::join_lines(<<END
6545 Already created a swash name '$name' for $swash_keys{$name}.  This means that
6546 the same name desired for $self shouldn't be used.  Bad News.  This must be
6547 fixed before production use, but proceeding anyway
6548 END
6549             ));
6550         }
6551         $swash_keys{$name} = "$self";
6552
6553         my $pre_body = "";
6554
6555         # Here we assume we were called after have gone through the whole
6556         # file.  If we actually generated anything for each map type, add its
6557         # respective header and trailer
6558         my $specials_name = "";
6559         if (@multi_code_point_maps) {
6560             $specials_name = "utf8::ToSpec$name";
6561             $pre_body .= <<END;
6562
6563 # Some code points require special handling because their mappings are each to
6564 # multiple code points.  These do not appear in the main body, but are defined
6565 # in the hash below.
6566
6567 # Each key is the string of N bytes that together make up the UTF-8 encoding
6568 # for the code point.  (i.e. the same as looking at the code point's UTF-8
6569 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
6570 \%$specials_name = (
6571 END
6572             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6573         }
6574
6575         my $format = $self->format;
6576
6577         my $return = "";
6578
6579         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6580         if ($output_adjusted) {
6581             if ($specials_name) {
6582                 $return .= <<END;
6583 # The mappings in the non-hash portion of this file must be modified to get the
6584 # correct values by adding the code point ordinal number to each one that is
6585 # numeric.
6586 END
6587             }
6588             else {
6589                 $return .= <<END;
6590 # The mappings must be modified to get the correct values by adding the code
6591 # point ordinal number to each one that is numeric.
6592 END
6593             }
6594         }
6595
6596         $return .= <<END;
6597
6598 # The name this swash is to be known by, with the format of the mappings in
6599 # the main body of the table, and what all code points missing from this file
6600 # map to.
6601 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6602 END
6603         if ($specials_name) {
6604             $return .= <<END;
6605 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6606 END
6607         }
6608         my $default_map = $default_map{$addr};
6609
6610         # For $CODE_POINT default maps and using adjustments, instead the default
6611         # becomes zero.
6612         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6613                 .  (($output_adjusted && $default_map eq $CODE_POINT)
6614                    ? "0"
6615                    : $default_map)
6616                 . "';";
6617
6618         if ($default_map eq $CODE_POINT) {
6619             $return .= ' # code point maps to itself';
6620         }
6621         elsif ($default_map eq "") {
6622             $return .= ' # code point maps to the null string';
6623         }
6624         $return .= "\n";
6625
6626         $return .= $pre_body;
6627
6628         return $return;
6629     }
6630
6631     sub write {
6632         # Write the table to the file.
6633
6634         my $self = shift;
6635         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6636
6637         my $addr = do { no overloading; pack 'J', $self; };
6638
6639         # Clear the temporaries
6640         undef @multi_code_point_maps;
6641
6642         # Calculate the format of the table if not already done.
6643         my $format = $self->format;
6644         my $type = $self->property->type;
6645         my $default_map = $self->default_map;
6646         if (! defined $format) {
6647             if ($type == $BINARY) {
6648
6649                 # Don't bother checking the values, because we elsewhere
6650                 # verify that a binary table has only 2 values.
6651                 $format = $BINARY_FORMAT;
6652             }
6653             else {
6654                 my @ranges = $self->_range_list->ranges;
6655
6656                 # default an empty table based on its type and default map
6657                 if (! @ranges) {
6658
6659                     # But it turns out that the only one we can say is a
6660                     # non-string (besides binary, handled above) is when the
6661                     # table is a string and the default map is to a code point
6662                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6663                         $format = $HEX_FORMAT;
6664                     }
6665                     else {
6666                         $format = $STRING_FORMAT;
6667                     }
6668                 }
6669                 else {
6670
6671                     # Start with the most restrictive format, and as we find
6672                     # something that doesn't fit with that, change to the next
6673                     # most restrictive, and so on.
6674                     $format = $DECIMAL_FORMAT;
6675                     foreach my $range (@ranges) {
6676                         next if $range->type != 0;  # Non-normal ranges don't
6677                                                     # affect the main body
6678                         my $map = $range->value;
6679                         if ($map ne $default_map) {
6680                             last if $format eq $STRING_FORMAT;  # already at
6681                                                                 # least
6682                                                                 # restrictive
6683                             $format = $INTEGER_FORMAT
6684                                                 if $format eq $DECIMAL_FORMAT
6685                                                     && $map !~ / ^ [0-9] $ /x;
6686                             $format = $FLOAT_FORMAT
6687                                             if $format eq $INTEGER_FORMAT
6688                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6689                             $format = $RATIONAL_FORMAT
6690                                 if $format eq $FLOAT_FORMAT
6691                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6692                             $format = $HEX_FORMAT
6693                                 if ($format eq $RATIONAL_FORMAT
6694                                        && $map !~
6695                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6696                                         # Assume a leading zero means hex,
6697                                         # even if all digits are 0-9
6698                                     || ($format eq $INTEGER_FORMAT
6699                                         && $map =~ /^0[0-9A-F]/);
6700                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6701                                                        && $map =~ /[^0-9A-F]/;
6702                         }
6703                     }
6704                 }
6705             }
6706         } # end of calculating format
6707
6708         if ($default_map eq $CODE_POINT
6709             && $format ne $HEX_FORMAT
6710             && ! defined $self->format)    # manual settings are always
6711                                            # considered ok
6712         {
6713             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6714         }
6715
6716         # If the output is to be adjusted, the format of the table that gets
6717         # output is actually 'a' instead of whatever it is stored internally
6718         # as.
6719         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6720         if ($output_adjusted) {
6721             $format = $ADJUST_FORMAT;
6722         }
6723
6724         $self->_set_format($format);
6725
6726         return $self->SUPER::write(
6727             $output_adjusted,
6728             ($self->property == $block)
6729                 ? 7     # block file needs more tab stops
6730                 : 3,
6731             $default_map);   # don't write defaulteds
6732     }
6733
6734     # Accessors for the underlying list that should fail if locked.
6735     for my $sub (qw(
6736                     add_duplicate
6737                 ))
6738     {
6739         no strict "refs";
6740         *$sub = sub {
6741             use strict "refs";
6742             my $self = shift;
6743
6744             return if $self->carp_if_locked;
6745             return $self->_range_list->$sub(@_);
6746         }
6747     }
6748 } # End closure for Map_Table
6749
6750 package Match_Table;
6751 use base '_Base_Table';
6752
6753 # A Match table is one which is a list of all the code points that have
6754 # the same property and property value, for use in \p{property=value}
6755 # constructs in regular expressions.  It adds very little data to the base
6756 # structure, but many methods, as these lists can be combined in many ways to
6757 # form new ones.
6758 # There are only a few concepts added:
6759 # 1) Equivalents and Relatedness.
6760 #    Two tables can match the identical code points, but have different names.
6761 #    This always happens when there is a perl single form extension
6762 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
6763 #    tables are set to be related, with the Perl extension being a child, and
6764 #    the Unicode property being the parent.
6765 #
6766 #    It may be that two tables match the identical code points and we don't
6767 #    know if they are related or not.  This happens most frequently when the
6768 #    Block and Script properties have the exact range.  But note that a
6769 #    revision to Unicode could add new code points to the script, which would
6770 #    now have to be in a different block (as the block was filled, or there
6771 #    would have been 'Unknown' script code points in it and they wouldn't have
6772 #    been identical).  So we can't rely on any two properties from Unicode
6773 #    always matching the same code points from release to release, and thus
6774 #    these tables are considered coincidentally equivalent--not related.  When
6775 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
6776 #    'leader', and the others are 'equivalents'.  This concept is useful
6777 #    to minimize the number of tables written out.  Only one file is used for
6778 #    any identical set of code points, with entries in Heavy.pl mapping all
6779 #    the involved tables to it.
6780 #
6781 #    Related tables will always be identical; we set them up to be so.  Thus
6782 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
6783 #    unrelated tables.  Relatedness makes generating the documentation easier.
6784 #
6785 # 2) Complement.
6786 #    Like equivalents, two tables may be the inverses of each other, the
6787 #    intersection between them is null, and the union is every Unicode code
6788 #    point.  The two tables that occupy a binary property are necessarily like
6789 #    this.  By specifying one table as the complement of another, we can avoid
6790 #    storing it on disk (using the other table and performing a fast
6791 #    transform), and some memory and calculations.
6792 #
6793 # 3) Conflicting.  It may be that there will eventually be name clashes, with
6794 #    the same name meaning different things.  For a while, there actually were
6795 #    conflicts, but they have so far been resolved by changing Perl's or
6796 #    Unicode's definitions to match the other, but when this code was written,
6797 #    it wasn't clear that that was what was going to happen.  (Unicode changed
6798 #    because of protests during their beta period.)  Name clashes are warned
6799 #    about during compilation, and the documentation.  The generated tables
6800 #    are sane, free of name clashes, because the code suppresses the Perl
6801 #    version.  But manual intervention to decide what the actual behavior
6802 #    should be may be required should this happen.  The introductory comments
6803 #    have more to say about this.
6804
6805 sub standardize { return main::standardize($_[0]); }
6806 sub trace { return main::trace(@_); }
6807
6808
6809 { # Closure
6810
6811     main::setup_package();
6812
6813     my %leader;
6814     # The leader table of this one; initially $self.
6815     main::set_access('leader', \%leader, 'r');
6816
6817     my %equivalents;
6818     # An array of any tables that have this one as their leader
6819     main::set_access('equivalents', \%equivalents, 'readable_array');
6820
6821     my %parent;
6822     # The parent table to this one, initially $self.  This allows us to
6823     # distinguish between equivalent tables that are related (for which this
6824     # is set to), and those which may not be, but share the same output file
6825     # because they match the exact same set of code points in the current
6826     # Unicode release.
6827     main::set_access('parent', \%parent, 'r');
6828
6829     my %children;
6830     # An array of any tables that have this one as their parent
6831     main::set_access('children', \%children, 'readable_array');
6832
6833     my %conflicting;
6834     # Array of any tables that would have the same name as this one with
6835     # a different meaning.  This is used for the generated documentation.
6836     main::set_access('conflicting', \%conflicting, 'readable_array');
6837
6838     my %matches_all;
6839     # Set in the constructor for tables that are expected to match all code
6840     # points.
6841     main::set_access('matches_all', \%matches_all, 'r');
6842
6843     my %complement;
6844     # Points to the complement that this table is expressed in terms of; 0 if
6845     # none.
6846     main::set_access('complement', \%complement, 'r');
6847
6848     sub new {
6849         my $class = shift;
6850
6851         my %args = @_;
6852
6853         # The property for which this table is a listing of property values.
6854         my $property = delete $args{'_Property'};
6855
6856         my $name = delete $args{'Name'};
6857         my $full_name = delete $args{'Full_Name'};
6858         $full_name = $name if ! defined $full_name;
6859
6860         # Optional
6861         my $initialize = delete $args{'Initialize'};
6862         my $matches_all = delete $args{'Matches_All'} || 0;
6863         my $format = delete $args{'Format'};
6864         # Rest of parameters passed on.
6865
6866         my $range_list = Range_List->new(Initialize => $initialize,
6867                                          Owner => $property);
6868
6869         my $complete = $full_name;
6870         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
6871                                               # but this helps debug if it
6872                                               # does
6873         # The complete name for a match table includes it's property in a
6874         # compound form 'property=table', except if the property is the
6875         # pseudo-property, perl, in which case it is just the single form,
6876         # 'table' (If you change the '=' must also change the ':' in lots of
6877         # places in this program that assume an equal sign)
6878         $complete = $property->full_name . "=$complete" if $property != $perl;
6879
6880         my $self = $class->SUPER::new(%args,
6881                                       Name => $name,
6882                                       Complete_Name => $complete,
6883                                       Full_Name => $full_name,
6884                                       _Property => $property,
6885                                       _Range_List => $range_list,
6886                                       Format => $EMPTY_FORMAT,
6887                                       );
6888         my $addr = do { no overloading; pack 'J', $self; };
6889
6890         $conflicting{$addr} = [ ];
6891         $equivalents{$addr} = [ ];
6892         $children{$addr} = [ ];
6893         $matches_all{$addr} = $matches_all;
6894         $leader{$addr} = $self;
6895         $parent{$addr} = $self;
6896         $complement{$addr} = 0;
6897
6898         if (defined $format && $format ne $EMPTY_FORMAT) {
6899             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
6900         }
6901
6902         return $self;
6903     }
6904
6905     # See this program's beginning comment block about overloading these.
6906     use overload
6907         fallback => 0,
6908         qw("") => "_operator_stringify",
6909         '=' => sub {
6910                     my $self = shift;
6911
6912                     return if $self->carp_if_locked;
6913                     return $self;
6914                 },
6915
6916         '+' => sub {
6917                         my $self = shift;
6918                         my $other = shift;
6919
6920                         return $self->_range_list + $other;
6921                     },
6922         '&' => sub {
6923                         my $self = shift;
6924                         my $other = shift;
6925
6926                         return $self->_range_list & $other;
6927                     },
6928         '+=' => sub {
6929                         my $self = shift;
6930                         my $other = shift;
6931                         my $reversed = shift;
6932
6933                         if ($reversed) {
6934                             Carp::my_carp_bug("Bad news.  Can't cope with '"
6935                             . ref($other)
6936                             . ' += '
6937                             . ref($self)
6938                             . "'.  undef returned.");
6939                             return;
6940                         }
6941
6942                         return if $self->carp_if_locked;
6943
6944                         my $addr = do { no overloading; pack 'J', $self; };
6945
6946                         if (ref $other) {
6947
6948                             # Change the range list of this table to be the
6949                             # union of the two.
6950                             $self->_set_range_list($self->_range_list
6951                                                     + $other);
6952                         }
6953                         else {    # $other is just a simple value
6954                             $self->add_range($other, $other);
6955                         }
6956                         return $self;
6957                     },
6958         '&=' => sub {
6959                         my $self = shift;
6960                         my $other = shift;
6961                         my $reversed = shift;
6962
6963                         if ($reversed) {
6964                             Carp::my_carp_bug("Bad news.  Can't cope with '"
6965                             . ref($other)
6966                             . ' &= '
6967                             . ref($self)
6968                             . "'.  undef returned.");
6969                             return;
6970                         }
6971
6972                         return if $self->carp_if_locked;
6973                         $self->_set_range_list($self->_range_list & $other);
6974                         return $self;
6975                     },
6976         '-' => sub { my $self = shift;
6977                     my $other = shift;
6978                     my $reversed = shift;
6979                     if ($reversed) {
6980                         Carp::my_carp_bug("Bad news.  Can't cope with '"
6981                         . ref($other)
6982                         . ' - '
6983                         . ref($self)
6984                         . "'.  undef returned.");
6985                         return;
6986                     }
6987
6988                     return $self->_range_list - $other;
6989                 },
6990         '~' => sub { my $self = shift;
6991                     return ~ $self->_range_list;
6992                 },
6993     ;
6994
6995     sub _operator_stringify {
6996         my $self = shift;
6997
6998         my $name = $self->complete_name;
6999         return "Table '$name'";
7000     }
7001
7002     sub _range_list {
7003         # Returns the range list associated with this table, which will be the
7004         # complement's if it has one.
7005
7006         my $self = shift;
7007         my $complement;
7008         if (($complement = $self->complement) != 0) {
7009             return ~ $complement->_range_list;
7010         }
7011         else {
7012             return $self->SUPER::_range_list;
7013         }
7014     }
7015
7016     sub add_alias {
7017         # Add a synonym for this table.  See the comments in the base class
7018
7019         my $self = shift;
7020         my $name = shift;
7021         # Rest of parameters passed on.
7022
7023         $self->SUPER::add_alias($name, $self, @_);
7024         return;
7025     }
7026
7027     sub add_conflicting {
7028         # Add the name of some other object to the list of ones that name
7029         # clash with this match table.
7030
7031         my $self = shift;
7032         my $conflicting_name = shift;   # The name of the conflicting object
7033         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7034         my $conflicting_object = shift; # Optional, the conflicting object
7035                                         # itself.  This is used to
7036                                         # disambiguate the text if the input
7037                                         # name is identical to any of the
7038                                         # aliases $self is known by.
7039                                         # Sometimes the conflicting object is
7040                                         # merely hypothetical, so this has to
7041                                         # be an optional parameter.
7042         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7043
7044         my $addr = do { no overloading; pack 'J', $self; };
7045
7046         # Check if the conflicting name is exactly the same as any existing
7047         # alias in this table (as long as there is a real object there to
7048         # disambiguate with).
7049         if (defined $conflicting_object) {
7050             foreach my $alias ($self->aliases) {
7051                 if ($alias->name eq $conflicting_name) {
7052
7053                     # Here, there is an exact match.  This results in
7054                     # ambiguous comments, so disambiguate by changing the
7055                     # conflicting name to its object's complete equivalent.
7056                     $conflicting_name = $conflicting_object->complete_name;
7057                     last;
7058                 }
7059             }
7060         }
7061
7062         # Convert to the \p{...} final name
7063         $conflicting_name = "\\$p" . "{$conflicting_name}";
7064
7065         # Only add once
7066         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7067
7068         push @{$conflicting{$addr}}, $conflicting_name;
7069
7070         return;
7071     }
7072
7073     sub is_set_equivalent_to {
7074         # Return boolean of whether or not the other object is a table of this
7075         # type and has been marked equivalent to this one.
7076
7077         my $self = shift;
7078         my $other = shift;
7079         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7080
7081         return 0 if ! defined $other; # Can happen for incomplete early
7082                                       # releases
7083         unless ($other->isa(__PACKAGE__)) {
7084             my $ref_other = ref $other;
7085             my $ref_self = ref $self;
7086             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7087             return 0;
7088         }
7089
7090         # Two tables are equivalent if they have the same leader.
7091         no overloading;
7092         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7093         return;
7094     }
7095
7096     sub set_equivalent_to {
7097         # Set $self equivalent to the parameter table.
7098         # The required Related => 'x' parameter is a boolean indicating
7099         # whether these tables are related or not.  If related, $other becomes
7100         # the 'parent' of $self; if unrelated it becomes the 'leader'
7101         #
7102         # Related tables share all characteristics except names; equivalents
7103         # not quite so many.
7104         # If they are related, one must be a perl extension.  This is because
7105         # we can't guarantee that Unicode won't change one or the other in a
7106         # later release even if they are identical now.
7107
7108         my $self = shift;
7109         my $other = shift;
7110
7111         my %args = @_;
7112         my $related = delete $args{'Related'};
7113
7114         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7115
7116         return if ! defined $other;     # Keep on going; happens in some early
7117                                         # Unicode releases.
7118
7119         if (! defined $related) {
7120             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7121             $related = 0;
7122         }
7123
7124         # If already are equivalent, no need to re-do it;  if subroutine
7125         # returns null, it found an error, also do nothing
7126         my $are_equivalent = $self->is_set_equivalent_to($other);
7127         return if ! defined $are_equivalent || $are_equivalent;
7128
7129         my $addr = do { no overloading; pack 'J', $self; };
7130         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7131
7132         if ($related) {
7133             if ($current_leader->perl_extension) {
7134                 if ($other->perl_extension) {
7135                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7136                     return;
7137                 }
7138             } elsif ($self->property != $other->property    # Depending on
7139                                                             # situation, might
7140                                                             # be better to use
7141                                                             # add_alias()
7142                                                             # instead for same
7143                                                             # property
7144                      && ! $other->perl_extension)
7145             {
7146                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7147                 $related = 0;
7148             }
7149         }
7150
7151         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7152             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7153             return;
7154         }
7155
7156         my $leader = do { no overloading; pack 'J', $current_leader; };
7157         my $other_addr = do { no overloading; pack 'J', $other; };
7158
7159         # Any tables that are equivalent to or children of this table must now
7160         # instead be equivalent to or (children) to the new leader (parent),
7161         # still equivalent.  The equivalency includes their matches_all info,
7162         # and for related tables, their fate and status.
7163         # All related tables are of necessity equivalent, but the converse
7164         # isn't necessarily true
7165         my $status = $other->status;
7166         my $status_info = $other->status_info;
7167         my $fate = $other->fate;
7168         my $matches_all = $matches_all{other_addr};
7169         my $caseless_equivalent = $other->caseless_equivalent;
7170         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7171             next if $table == $other;
7172             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7173
7174             my $table_addr = do { no overloading; pack 'J', $table; };
7175             $leader{$table_addr} = $other;
7176             $matches_all{$table_addr} = $matches_all;
7177             $self->_set_range_list($other->_range_list);
7178             push @{$equivalents{$other_addr}}, $table;
7179             if ($related) {
7180                 $parent{$table_addr} = $other;
7181                 push @{$children{$other_addr}}, $table;
7182                 $table->set_status($status, $status_info);
7183
7184                 # This reason currently doesn't get exposed outside; otherwise
7185                 # would have to look up the parent's reason and use it instead.
7186                 $table->set_fate($fate, "Parent's fate");
7187
7188                 $self->set_caseless_equivalent($caseless_equivalent);
7189             }
7190         }
7191
7192         # Now that we've declared these to be equivalent, any changes to one
7193         # of the tables would invalidate that equivalency.
7194         $self->lock;
7195         $other->lock;
7196         return;
7197     }
7198
7199     sub set_complement {
7200         # Set $self to be the complement of the parameter table.  $self is
7201         # locked, as what it contains should all come from the other table.
7202
7203         my $self = shift;
7204         my $other = shift;
7205
7206         my %args = @_;
7207         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7208
7209         if ($other->complement != 0) {
7210             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7211             return;
7212         }
7213         my $addr = do { no overloading; pack 'J', $self; };
7214         $complement{$addr} = $other;
7215         $self->lock;
7216         return;
7217     }
7218
7219     sub add_range { # Add a range to the list for this table.
7220         my $self = shift;
7221         # Rest of parameters passed on
7222
7223         return if $self->carp_if_locked;
7224         return $self->_range_list->add_range(@_);
7225     }
7226
7227     sub header {
7228         my $self = shift;
7229         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7230
7231         # All match tables are to be used only by the Perl core.
7232         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7233     }
7234
7235     sub pre_body {  # Does nothing for match tables.
7236         return
7237     }
7238
7239     sub append_to_body {  # Does nothing for match tables.
7240         return
7241     }
7242
7243     sub set_fate {
7244         my $self = shift;
7245         my $fate = shift;
7246         my $reason = shift;
7247         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7248
7249         $self->SUPER::set_fate($fate, $reason);
7250
7251         # All children share this fate
7252         foreach my $child ($self->children) {
7253             $child->set_fate($fate, $reason);
7254         }
7255         return;
7256     }
7257
7258     sub write {
7259         my $self = shift;
7260         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7261
7262         return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7263     }
7264
7265     sub set_final_comment {
7266         # This creates a comment for the file that is to hold the match table
7267         # $self.  It is somewhat convoluted to make the English read nicely,
7268         # but, heh, it's just a comment.
7269         # This should be called only with the leader match table of all the
7270         # ones that share the same file.  It lists all such tables, ordered so
7271         # that related ones are together.
7272
7273         return unless $debugging_build;
7274
7275         my $leader = shift;   # Should only be called on the leader table of
7276                               # an equivalent group
7277         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7278
7279         my $addr = do { no overloading; pack 'J', $leader; };
7280
7281         if ($leader{$addr} != $leader) {
7282             Carp::my_carp_bug(<<END
7283 set_final_comment() must be called on a leader table, which $leader is not.
7284 It is equivalent to $leader{$addr}.  No comment created
7285 END
7286             );
7287             return;
7288         }
7289
7290         # Get the number of code points matched by each of the tables in this
7291         # file, and add underscores for clarity.
7292         my $count = $leader->count;
7293         my $string_count = main::clarify_number($count);
7294
7295         my $loose_count = 0;        # how many aliases loosely matched
7296         my $compound_name = "";     # ? Are any names compound?, and if so, an
7297                                     # example
7298         my $properties_with_compound_names = 0;    # count of these
7299
7300
7301         my %flags;              # The status flags used in the file
7302         my $total_entries = 0;  # number of entries written in the comment
7303         my $matches_comment = ""; # The portion of the comment about the
7304                                   # \p{}'s
7305         my @global_comments;    # List of all the tables' comments that are
7306                                 # there before this routine was called.
7307         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
7308                                 # Unicode::UCD.  If not, then don't say it is
7309                                 # in the comment
7310
7311         # Get list of all the parent tables that are equivalent to this one
7312         # (including itself).
7313         my @parents = grep { $parent{main::objaddr $_} == $_ }
7314                             main::uniques($leader, @{$equivalents{$addr}});
7315         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
7316                                               # tables
7317
7318         for my $parent (@parents) {
7319
7320             my $property = $parent->property;
7321
7322             # Special case 'N' tables in properties with two match tables when
7323             # the other is a 'Y' one.  These are likely to be binary tables,
7324             # but not necessarily.  In either case, \P{} will match the
7325             # complement of \p{}, and so if something is a synonym of \p, the
7326             # complement of that something will be the synonym of \P.  This
7327             # would be true of any property with just two match tables, not
7328             # just those whose values are Y and N; but that would require a
7329             # little extra work, and there are none such so far in Unicode.
7330             my $perl_p = 'p';        # which is it?  \p{} or \P{}
7331             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
7332
7333             if (scalar $property->tables == 2
7334                 && $parent == $property->table('N')
7335                 && defined (my $yes = $property->table('Y')))
7336             {
7337                 my $yes_addr = do { no overloading; pack 'J', $yes; };
7338                 @yes_perl_synonyms
7339                     = grep { $_->property == $perl }
7340                                     main::uniques($yes,
7341                                                 $parent{$yes_addr},
7342                                                 $parent{$yes_addr}->children);
7343
7344                 # But these synonyms are \P{} ,not \p{}
7345                 $perl_p = 'P';
7346             }
7347
7348             my @description;        # Will hold the table description
7349             my @note;               # Will hold the table notes.
7350             my @conflicting;        # Will hold the table conflicts.
7351
7352             # Look at the parent, any yes synonyms, and all the children
7353             my $parent_addr = do { no overloading; pack 'J', $parent; };
7354             for my $table ($parent,
7355                            @yes_perl_synonyms,
7356                            @{$children{$parent_addr}})
7357             {
7358                 my $table_addr = do { no overloading; pack 'J', $table; };
7359                 my $table_property = $table->property;
7360
7361                 # Tables are separated by a blank line to create a grouping.
7362                 $matches_comment .= "\n" if $matches_comment;
7363
7364                 # The table is named based on the property and value
7365                 # combination it is for, like script=greek.  But there may be
7366                 # a number of synonyms for each side, like 'sc' for 'script',
7367                 # and 'grek' for 'greek'.  Any combination of these is a valid
7368                 # name for this table.  In this case, there are three more,
7369                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
7370                 # listing all possible combinations in the comment, we make
7371                 # sure that each synonym occurs at least once, and add
7372                 # commentary that the other combinations are possible.
7373                 # Because regular expressions don't recognize things like
7374                 # \p{jsn=}, only look at non-null right-hand-sides
7375                 my @property_aliases = $table_property->aliases;
7376                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7377
7378                 # The alias lists above are already ordered in the order we
7379                 # want to output them.  To ensure that each synonym is listed,
7380                 # we must use the max of the two numbers.  But if there are no
7381                 # legal synonyms (nothing in @table_aliases), then we don't
7382                 # list anything.
7383                 my $listed_combos = (@table_aliases)
7384                                     ?  main::max(scalar @table_aliases,
7385                                                  scalar @property_aliases)
7386                                     : 0;
7387                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7388
7389
7390                 my $property_had_compound_name = 0;
7391
7392                 for my $i (0 .. $listed_combos - 1) {
7393                     $total_entries++;
7394
7395                     # The current alias for the property is the next one on
7396                     # the list, or if beyond the end, start over.  Similarly
7397                     # for the table (\p{prop=table})
7398                     my $property_alias = $property_aliases
7399                                             [$i % @property_aliases]->name;
7400                     my $table_alias_object = $table_aliases
7401                                                         [$i % @table_aliases];
7402                     my $table_alias = $table_alias_object->name;
7403                     my $loose_match = $table_alias_object->loose_match;
7404                     $has_ucd_alias |= $table_alias_object->ucd;
7405
7406                     if ($table_alias !~ /\D/) { # Clarify large numbers.
7407                         $table_alias = main::clarify_number($table_alias)
7408                     }
7409
7410                     # Add a comment for this alias combination
7411                     my $current_match_comment;
7412                     if ($table_property == $perl) {
7413                         $current_match_comment = "\\$perl_p"
7414                                                     . "{$table_alias}";
7415                     }
7416                     else {
7417                         $current_match_comment
7418                                         = "\\p{$property_alias=$table_alias}";
7419                         $property_had_compound_name = 1;
7420                     }
7421
7422                     # Flag any abnormal status for this table.
7423                     my $flag = $property->status
7424                                 || $table->status
7425                                 || $table_alias_object->status;
7426                     if ($flag && $flag ne $PLACEHOLDER) {
7427                         $flags{$flag} = $status_past_participles{$flag};
7428                     }
7429
7430                     $loose_count++;
7431
7432                     # Pretty up the comment.  Note the \b; it says don't make
7433                     # this line a continuation.
7434                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7435                                         $flag,
7436                                         " " x 7,
7437                                         $current_match_comment);
7438                 } # End of generating the entries for this table.
7439
7440                 # Save these for output after this group of related tables.
7441                 push @description, $table->description;
7442                 push @note, $table->note;
7443                 push @conflicting, $table->conflicting;
7444
7445                 # And this for output after all the tables.
7446                 push @global_comments, $table->comment;
7447
7448                 # Compute an alternate compound name using the final property
7449                 # synonym and the first table synonym with a colon instead of
7450                 # the equal sign used elsewhere.
7451                 if ($property_had_compound_name) {
7452                     $properties_with_compound_names ++;
7453                     if (! $compound_name || @property_aliases > 1) {
7454                         $compound_name = $property_aliases[-1]->name
7455                                         . ': '
7456                                         . $table_aliases[0]->name;
7457                     }
7458                 }
7459             } # End of looping through all children of this table
7460
7461             # Here have assembled in $matches_comment all the related tables
7462             # to the current parent (preceded by the same info for all the
7463             # previous parents).  Put out information that applies to all of
7464             # the current family.
7465             if (@conflicting) {
7466
7467                 # But output the conflicting information now, as it applies to
7468                 # just this table.
7469                 my $conflicting = join ", ", @conflicting;
7470                 if ($conflicting) {
7471                     $matches_comment .= <<END;
7472
7473     Note that contrary to what you might expect, the above is NOT the same as
7474 END
7475                     $matches_comment .= "any of: " if @conflicting > 1;
7476                     $matches_comment .= "$conflicting\n";
7477                 }
7478             }
7479             if (@description) {
7480                 $matches_comment .= "\n    Meaning: "
7481                                     . join('; ', @description)
7482                                     . "\n";
7483             }
7484             if (@note) {
7485                 $matches_comment .= "\n    Note: "
7486                                     . join("\n    ", @note)
7487                                     . "\n";
7488             }
7489         } # End of looping through all tables
7490
7491
7492         my $code_points;
7493         my $match;
7494         my $any_of_these;
7495         if ($count == 1) {
7496             $match = 'matches';
7497             $code_points = 'single code point';
7498         }
7499         else {
7500             $match = 'match';
7501             $code_points = "$string_count code points";
7502         }
7503
7504         my $synonyms;
7505         my $entries;
7506         if ($total_entries == 1) {
7507             $synonyms = "";
7508             $entries = 'entry';
7509             $any_of_these = 'this'
7510         }
7511         else {
7512             $synonyms = " any of the following regular expression constructs";
7513             $entries = 'entries';
7514             $any_of_these = 'any of these'
7515         }
7516
7517         my $comment = "";
7518         if ($has_ucd_alias) {
7519             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7520         }
7521         if ($has_unrelated) {
7522             $comment .= <<END;
7523 This file is for tables that are not necessarily related:  To conserve
7524 resources, every table that matches the identical set of code points in this
7525 version of Unicode uses this file.  Each one is listed in a separate group
7526 below.  It could be that the tables will match the same set of code points in
7527 other Unicode releases, or it could be purely coincidence that they happen to
7528 be the same in Unicode $string_version, and hence may not in other versions.
7529
7530 END
7531         }
7532
7533         if (%flags) {
7534             foreach my $flag (sort keys %flags) {
7535                 $comment .= <<END;
7536 '$flag' below means that this form is $flags{$flag}.
7537 Consult $pod_file.pod
7538 END
7539             }
7540             $comment .= "\n";
7541         }
7542
7543         if ($total_entries == 0) {
7544             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
7545             $comment .= <<END;
7546 This file returns the $code_points in Unicode Version $string_version for
7547 $leader, but it is inaccessible through Perl regular expressions, as
7548 "\\p{prop=}" is not recognized.
7549 END
7550
7551         } else {
7552             $comment .= <<END;
7553 This file returns the $code_points in Unicode Version $string_version that
7554 $match$synonyms:
7555
7556 $matches_comment
7557 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7558 including if adding or subtracting white space, underscore, and hyphen
7559 characters matters or doesn't matter, and other permissible syntactic
7560 variants.  Upper/lower case distinctions never matter.
7561 END
7562
7563         }
7564         if ($compound_name) {
7565             $comment .= <<END;
7566
7567 A colon can be substituted for the equals sign, and
7568 END
7569             if ($properties_with_compound_names > 1) {
7570                 $comment .= <<END;
7571 within each group above,
7572 END
7573             }
7574             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7575
7576             # Note the \b below, it says don't make that line a continuation.
7577             $comment .= <<END;
7578 anything to the left of the equals (or colon) can be combined with anything to
7579 the right.  Thus, for example,
7580 $compound_name
7581 \bis also valid.
7582 END
7583         }
7584
7585         # And append any comment(s) from the actual tables.  They are all
7586         # gathered here, so may not read all that well.
7587         if (@global_comments) {
7588             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7589         }
7590
7591         if ($count) {   # The format differs if no code points, and needs no
7592                         # explanation in that case
7593                 $comment.= <<END;
7594
7595 The format of the lines of this file is:
7596 END
7597             $comment.= <<END;
7598 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7599 STOP is the ending point, or if omitted, the range has just one code point.
7600 END
7601             if ($leader->output_range_counts) {
7602                 $comment .= <<END;
7603 Numbers in comments in [brackets] indicate how many code points are in the
7604 range.
7605 END
7606             }
7607         }
7608
7609         $leader->set_comment(main::join_lines($comment));
7610         return;
7611     }
7612
7613     # Accessors for the underlying list
7614     for my $sub (qw(
7615                     get_valid_code_point
7616                     get_invalid_code_point
7617                 ))
7618     {
7619         no strict "refs";
7620         *$sub = sub {
7621             use strict "refs";
7622             my $self = shift;
7623
7624             return $self->_range_list->$sub(@_);
7625         }
7626     }
7627 } # End closure for Match_Table
7628
7629 package Property;
7630
7631 # The Property class represents a Unicode property, or the $perl
7632 # pseudo-property.  It contains a map table initialized empty at construction
7633 # time, and for properties accessible through regular expressions, various
7634 # match tables, created through the add_match_table() method, and referenced
7635 # by the table('NAME') or tables() methods, the latter returning a list of all
7636 # of the match tables.  Otherwise table operations implicitly are for the map
7637 # table.
7638 #
7639 # Most of the data in the property is actually about its map table, so it
7640 # mostly just uses that table's accessors for most methods.  The two could
7641 # have been combined into one object, but for clarity because of their
7642 # differing semantics, they have been kept separate.  It could be argued that
7643 # the 'file' and 'directory' fields should be kept with the map table.
7644 #
7645 # Each property has a type.  This can be set in the constructor, or in the
7646 # set_type accessor, but mostly it is figured out by the data.  Every property
7647 # starts with unknown type, overridden by a parameter to the constructor, or
7648 # as match tables are added, or ranges added to the map table, the data is
7649 # inspected, and the type changed.  After the table is mostly or entirely
7650 # filled, compute_type() should be called to finalize they analysis.
7651 #
7652 # There are very few operations defined.  One can safely remove a range from
7653 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7654 # table to this one, replacing any in the intersection of the two.
7655
7656 sub standardize { return main::standardize($_[0]); }
7657 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7658
7659 {   # Closure
7660
7661     # This hash will contain as keys, all the aliases of all properties, and
7662     # as values, pointers to their respective property objects.  This allows
7663     # quick look-up of a property from any of its names.
7664     my %alias_to_property_of;
7665
7666     sub dump_alias_to_property_of {
7667         # For debugging
7668
7669         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7670         return;
7671     }
7672
7673     sub property_ref {
7674         # This is a package subroutine, not called as a method.
7675         # If the single parameter is a literal '*' it returns a list of all
7676         # defined properties.
7677         # Otherwise, the single parameter is a name, and it returns a pointer
7678         # to the corresponding property object, or undef if none.
7679         #
7680         # Properties can have several different names.  The 'standard' form of
7681         # each of them is stored in %alias_to_property_of as they are defined.
7682         # But it's possible that this subroutine will be called with some
7683         # variant, so if the initial lookup fails, it is repeated with the
7684         # standardized form of the input name.  If found, besides returning the
7685         # result, the input name is added to the list so future calls won't
7686         # have to do the conversion again.
7687
7688         my $name = shift;
7689
7690         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7691
7692         if (! defined $name) {
7693             Carp::my_carp_bug("Undefined input property.  No action taken.");
7694             return;
7695         }
7696
7697         return main::uniques(values %alias_to_property_of) if $name eq '*';
7698
7699         # Return cached result if have it.
7700         my $result = $alias_to_property_of{$name};
7701         return $result if defined $result;
7702
7703         # Convert the input to standard form.
7704         my $standard_name = standardize($name);
7705
7706         $result = $alias_to_property_of{$standard_name};
7707         return unless defined $result;        # Don't cache undefs
7708
7709         # Cache the result before returning it.
7710         $alias_to_property_of{$name} = $result;
7711         return $result;
7712     }
7713
7714
7715     main::setup_package();
7716
7717     my %map;
7718     # A pointer to the map table object for this property
7719     main::set_access('map', \%map);
7720
7721     my %full_name;
7722     # The property's full name.  This is a duplicate of the copy kept in the
7723     # map table, but is needed because stringify needs it during
7724     # construction of the map table, and then would have a chicken before egg
7725     # problem.
7726     main::set_access('full_name', \%full_name, 'r');
7727
7728     my %table_ref;
7729     # This hash will contain as keys, all the aliases of any match tables
7730     # attached to this property, and as values, the pointers to their
7731     # respective tables.  This allows quick look-up of a table from any of its
7732     # names.
7733     main::set_access('table_ref', \%table_ref);
7734
7735     my %type;
7736     # The type of the property, $ENUM, $BINARY, etc
7737     main::set_access('type', \%type, 'r');
7738
7739     my %file;
7740     # The filename where the map table will go (if actually written).
7741     # Normally defaulted, but can be overridden.
7742     main::set_access('file', \%file, 'r', 's');
7743
7744     my %directory;
7745     # The directory where the map table will go (if actually written).
7746     # Normally defaulted, but can be overridden.
7747     main::set_access('directory', \%directory, 's');
7748
7749     my %pseudo_map_type;
7750     # This is used to affect the calculation of the map types for all the
7751     # ranges in the table.  It should be set to one of the values that signify
7752     # to alter the calculation.
7753     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7754
7755     my %has_only_code_point_maps;
7756     # A boolean used to help in computing the type of data in the map table.
7757     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7758
7759     my %unique_maps;
7760     # A list of the first few distinct mappings this property has.  This is
7761     # used to disambiguate between binary and enum property types, so don't
7762     # have to keep more than three.
7763     main::set_access('unique_maps', \%unique_maps);
7764
7765     my %pre_declared_maps;
7766     # A boolean that gives whether the input data should declare all the
7767     # tables used, or not.  If the former, unknown ones raise a warning.
7768     main::set_access('pre_declared_maps',
7769                                     \%pre_declared_maps, 'r', 's');
7770
7771     sub new {
7772         # The only required parameter is the positionally first, name.  All
7773         # other parameters are key => value pairs.  See the documentation just
7774         # above for the meanings of the ones not passed directly on to the map
7775         # table constructor.
7776
7777         my $class = shift;
7778         my $name = shift || "";
7779
7780         my $self = property_ref($name);
7781         if (defined $self) {
7782             my $options_string = join ", ", @_;
7783             $options_string = ".  Ignoring options $options_string" if $options_string;
7784             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
7785             return $self;
7786         }
7787
7788         my %args = @_;
7789
7790         $self = bless \do { my $anonymous_scalar }, $class;
7791         my $addr = do { no overloading; pack 'J', $self; };
7792
7793         $directory{$addr} = delete $args{'Directory'};
7794         $file{$addr} = delete $args{'File'};
7795         $full_name{$addr} = delete $args{'Full_Name'} || $name;
7796         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7797         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7798         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7799                                     # Starting in this release, property
7800                                     # values should be defined for all
7801                                     # properties, except those overriding this
7802                                     // $v_version ge v5.1.0;
7803
7804         # Rest of parameters passed on.
7805
7806         $has_only_code_point_maps{$addr} = 1;
7807         $table_ref{$addr} = { };
7808         $unique_maps{$addr} = { };
7809
7810         $map{$addr} = Map_Table->new($name,
7811                                     Full_Name => $full_name{$addr},
7812                                     _Alias_Hash => \%alias_to_property_of,
7813                                     _Property => $self,
7814                                     %args);
7815         return $self;
7816     }
7817
7818     # See this program's beginning comment block about overloading the copy
7819     # constructor.  Few operations are defined on properties, but a couple are
7820     # useful.  It is safe to take the inverse of a property, and to remove a
7821     # single code point from it.
7822     use overload
7823         fallback => 0,
7824         qw("") => "_operator_stringify",
7825         "." => \&main::_operator_dot,
7826         ".=" => \&main::_operator_dot_equal,
7827         '==' => \&main::_operator_equal,
7828         '!=' => \&main::_operator_not_equal,
7829         '=' => sub { return shift },
7830         '-=' => "_minus_and_equal",
7831     ;
7832
7833     sub _operator_stringify {
7834         return "Property '" .  shift->full_name . "'";
7835     }
7836
7837     sub _minus_and_equal {
7838         # Remove a single code point from the map table of a property.
7839
7840         my $self = shift;
7841         my $other = shift;
7842         my $reversed = shift;
7843         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7844
7845         if (ref $other) {
7846             Carp::my_carp_bug("Bad news.  Can't cope with a "
7847                         . ref($other)
7848                         . " argument to '-='.  Subtraction ignored.");
7849             return $self;
7850         }
7851         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
7852             Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
7853             . ref $self
7854             . " from a non-object.  undef returned.");
7855             return;
7856         }
7857         else {
7858             no overloading;
7859             $map{pack 'J', $self}->delete_range($other, $other);
7860         }
7861         return $self;
7862     }
7863
7864     sub add_match_table {
7865         # Add a new match table for this property, with name given by the
7866         # parameter.  It returns a pointer to the table.
7867
7868         my $self = shift;
7869         my $name = shift;
7870         my %args = @_;
7871
7872         my $addr = do { no overloading; pack 'J', $self; };
7873
7874         my $table = $table_ref{$addr}{$name};
7875         my $standard_name = main::standardize($name);
7876         if (defined $table
7877             || (defined ($table = $table_ref{$addr}{$standard_name})))
7878         {
7879             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
7880             $table_ref{$addr}{$name} = $table;
7881             return $table;
7882         }
7883         else {
7884
7885             # See if this is a perl extension, if not passed in.
7886             my $perl_extension = delete $args{'Perl_Extension'};
7887             $perl_extension
7888                         = $self->perl_extension if ! defined $perl_extension;
7889
7890             $table = Match_Table->new(
7891                                 Name => $name,
7892                                 Perl_Extension => $perl_extension,
7893                                 _Alias_Hash => $table_ref{$addr},
7894                                 _Property => $self,
7895
7896                                 # gets property's fate and status by default
7897                                 Fate => $self->fate,
7898                                 Status => $self->status,
7899                                 _Status_Info => $self->status_info,
7900                                 %args);
7901             return unless defined $table;
7902         }
7903
7904         # Save the names for quick look up
7905         $table_ref{$addr}{$standard_name} = $table;
7906         $table_ref{$addr}{$name} = $table;
7907
7908         # Perhaps we can figure out the type of this property based on the
7909         # fact of adding this match table.  First, string properties don't
7910         # have match tables; second, a binary property can't have 3 match
7911         # tables
7912         if ($type{$addr} == $UNKNOWN) {
7913             $type{$addr} = $NON_STRING;
7914         }
7915         elsif ($type{$addr} == $STRING) {
7916             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
7917             $type{$addr} = $NON_STRING;
7918         }
7919         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7920             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7921                 && $type{$addr} == $BINARY)
7922             {
7923                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
7924                 $type{$addr} = $ENUM;
7925             }
7926         }
7927
7928         return $table;
7929     }
7930
7931     sub delete_match_table {
7932         # Delete the table referred to by $2 from the property $1.
7933
7934         my $self = shift;
7935         my $table_to_remove = shift;
7936         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7937
7938         my $addr = do { no overloading; pack 'J', $self; };
7939
7940         # Remove all names that refer to it.
7941         foreach my $key (keys %{$table_ref{$addr}}) {
7942             delete $table_ref{$addr}{$key}
7943                                 if $table_ref{$addr}{$key} == $table_to_remove;
7944         }
7945
7946         $table_to_remove->DESTROY;
7947         return;
7948     }
7949
7950     sub table {
7951         # Return a pointer to the match table (with name given by the
7952         # parameter) associated with this property; undef if none.
7953
7954         my $self = shift;
7955         my $name = shift;
7956         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7957
7958         my $addr = do { no overloading; pack 'J', $self; };
7959
7960         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7961
7962         # If quick look-up failed, try again using the standard form of the
7963         # input name.  If that succeeds, cache the result before returning so
7964         # won't have to standardize this input name again.
7965         my $standard_name = main::standardize($name);
7966         return unless defined $table_ref{$addr}{$standard_name};
7967
7968         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7969         return $table_ref{$addr}{$name};
7970     }
7971
7972     sub tables {
7973         # Return a list of pointers to all the match tables attached to this
7974         # property
7975
7976         no overloading;
7977         return main::uniques(values %{$table_ref{pack 'J', shift}});
7978     }
7979
7980     sub directory {
7981         # Returns the directory the map table for this property should be
7982         # output in.  If a specific directory has been specified, that has
7983         # priority;  'undef' is returned if the type isn't defined;
7984         # or $map_directory for everything else.
7985
7986         my $addr = do { no overloading; pack 'J', shift; };
7987
7988         return $directory{$addr} if defined $directory{$addr};
7989         return undef if $type{$addr} == $UNKNOWN;
7990         return $map_directory;
7991     }
7992
7993     sub swash_name {
7994         # Return the name that is used to both:
7995         #   1)  Name the file that the map table is written to.
7996         #   2)  The name of swash related stuff inside that file.
7997         # The reason for this is that the Perl core historically has used
7998         # certain names that aren't the same as the Unicode property names.
7999         # To continue using these, $file is hard-coded in this file for those,
8000         # but otherwise the standard name is used.  This is different from the
8001         # external_name, so that the rest of the files, like in lib can use
8002         # the standard name always, without regard to historical precedent.
8003
8004         my $self = shift;
8005         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8006
8007         my $addr = do { no overloading; pack 'J', $self; };
8008
8009         # Swash names are used only on regular map tables; otherwise there
8010         # should be no access to the property map table from other parts of
8011         # Perl.
8012         return if $map{$addr}->fate != $ORDINARY;
8013
8014         return $file{$addr} if defined $file{$addr};
8015         return $map{$addr}->external_name;
8016     }
8017
8018     sub to_create_match_tables {
8019         # Returns a boolean as to whether or not match tables should be
8020         # created for this property.
8021
8022         my $self = shift;
8023         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8024
8025         # The whole point of this pseudo property is match tables.
8026         return 1 if $self == $perl;
8027
8028         my $addr = do { no overloading; pack 'J', $self; };
8029
8030         # Don't generate tables of code points that match the property values
8031         # of a string property.  Such a list would most likely have many
8032         # property values, each with just one or very few code points mapping
8033         # to it.
8034         return 0 if $type{$addr} == $STRING;
8035
8036         # Don't generate anything for unimplemented properties.
8037         return 0 if grep { $self->complete_name eq $_ }
8038                                                     @unimplemented_properties;
8039         # Otherwise, do.
8040         return 1;
8041     }
8042
8043     sub property_add_or_replace_non_nulls {
8044         # This adds the mappings in the property $other to $self.  Non-null
8045         # mappings from $other override those in $self.  It essentially merges
8046         # the two properties, with the second having priority except for null
8047         # mappings.
8048
8049         my $self = shift;
8050         my $other = shift;
8051         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8052
8053         if (! $other->isa(__PACKAGE__)) {
8054             Carp::my_carp_bug("$other should be a "
8055                             . __PACKAGE__
8056                             . ".  Not a '"
8057                             . ref($other)
8058                             . "'.  Not added;");
8059             return;
8060         }
8061
8062         no overloading;
8063         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8064     }
8065
8066     sub set_proxy_for {
8067         # Certain tables are not generally written out to files, but
8068         # Unicode::UCD has the intelligence to know that the file for $self
8069         # can be used to reconstruct those tables.  This routine just changes
8070         # things so that UCD pod entries for those suppressed tables are
8071         # generated, so the fact that a proxy is used is invisible to the
8072         # user.
8073
8074         my $self = shift;
8075
8076         foreach my $property_name (@_) {
8077             my $ref = property_ref($property_name);
8078             next if $ref->to_output_map;
8079             $ref->set_fate($MAP_PROXIED);
8080         }
8081     }
8082
8083     sub set_type {
8084         # Set the type of the property.  Mostly this is figured out by the
8085         # data in the table.  But this is used to set it explicitly.  The
8086         # reason it is not a standard accessor is that when setting a binary
8087         # property, we need to make sure that all the true/false aliases are
8088         # present, as they were omitted in early Unicode releases.
8089
8090         my $self = shift;
8091         my $type = shift;
8092         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8093
8094         if ($type != $ENUM
8095             && $type != $BINARY
8096             && $type != $FORCED_BINARY
8097             && $type != $STRING)
8098         {
8099             Carp::my_carp("Unrecognized type '$type'.  Type not set");
8100             return;
8101         }
8102
8103         { no overloading; $type{pack 'J', $self} = $type; }
8104         return if $type != $BINARY && $type != $FORCED_BINARY;
8105
8106         my $yes = $self->table('Y');
8107         $yes = $self->table('Yes') if ! defined $yes;
8108         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8109                                                             if ! defined $yes;
8110
8111         # Add aliases in order wanted, duplicates will be ignored.  We use a
8112         # binary property present in all releases for its ordered lists of
8113         # true/false aliases.  Note, that could run into problems in
8114         # outputting things in that we don't distinguish between the name and
8115         # full name of these.  Hopefully, if the table was already created
8116         # before this code is executed, it was done with these set properly.
8117         my $bm = property_ref("Bidi_Mirrored");
8118         foreach my $alias ($bm->table("Y")->aliases) {
8119             $yes->add_alias($alias->name);
8120         }
8121         my $no = $self->table('N');
8122         $no = $self->table('No') if ! defined $no;
8123         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8124         foreach my $alias ($bm->table("N")->aliases) {
8125             $no->add_alias($alias->name);
8126         }
8127
8128         return;
8129     }
8130
8131     sub add_map {
8132         # Add a map to the property's map table.  This also keeps
8133         # track of the maps so that the property type can be determined from
8134         # its data.
8135
8136         my $self = shift;
8137         my $start = shift;  # First code point in range
8138         my $end = shift;    # Final code point in range
8139         my $map = shift;    # What the range maps to.
8140         # Rest of parameters passed on.
8141
8142         my $addr = do { no overloading; pack 'J', $self; };
8143
8144         # If haven't the type of the property, gather information to figure it
8145         # out.
8146         if ($type{$addr} == $UNKNOWN) {
8147
8148             # If the map contains an interior blank or dash, or most other
8149             # nonword characters, it will be a string property.  This
8150             # heuristic may actually miss some string properties.  If so, they
8151             # may need to have explicit set_types called for them.  This
8152             # happens in the Unihan properties.
8153             if ($map =~ / (?<= . ) [ -] (?= . ) /x
8154                 || $map =~ / [^\w.\/\ -]  /x)
8155             {
8156                 $self->set_type($STRING);
8157
8158                 # $unique_maps is used for disambiguating between ENUM and
8159                 # BINARY later; since we know the property is not going to be
8160                 # one of those, no point in keeping the data around
8161                 undef $unique_maps{$addr};
8162             }
8163             else {
8164
8165                 # Not necessarily a string.  The final decision has to be
8166                 # deferred until all the data are in.  We keep track of if all
8167                 # the values are code points for that eventual decision.
8168                 $has_only_code_point_maps{$addr} &=
8169                                             $map =~ / ^ $code_point_re $/x;
8170
8171                 # For the purposes of disambiguating between binary and other
8172                 # enumerations at the end, we keep track of the first three
8173                 # distinct property values.  Once we get to three, we know
8174                 # it's not going to be binary, so no need to track more.
8175                 if (scalar keys %{$unique_maps{$addr}} < 3) {
8176                     $unique_maps{$addr}{main::standardize($map)} = 1;
8177                 }
8178             }
8179         }
8180
8181         # Add the mapping by calling our map table's method
8182         return $map{$addr}->add_map($start, $end, $map, @_);
8183     }
8184
8185     sub compute_type {
8186         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
8187         # should be called after the property is mostly filled with its maps.
8188         # We have been keeping track of what the property values have been,
8189         # and now have the necessary information to figure out the type.
8190
8191         my $self = shift;
8192         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8193
8194         my $addr = do { no overloading; pack 'J', $self; };
8195
8196         my $type = $type{$addr};
8197
8198         # If already have figured these out, no need to do so again, but we do
8199         # a double check on ENUMS to make sure that a string property hasn't
8200         # improperly been classified as an ENUM, so continue on with those.
8201         return if $type == $STRING
8202                   || $type == $BINARY
8203                   || $type == $FORCED_BINARY;
8204
8205         # If every map is to a code point, is a string property.
8206         if ($type == $UNKNOWN
8207             && ($has_only_code_point_maps{$addr}
8208                 || (defined $map{$addr}->default_map
8209                     && $map{$addr}->default_map eq "")))
8210         {
8211             $self->set_type($STRING);
8212         }
8213         else {
8214
8215             # Otherwise, it is to some sort of enumeration.  (The case where
8216             # it is a Unicode miscellaneous property, and treated like a
8217             # string in this program is handled in add_map()).  Distinguish
8218             # between binary and some other enumeration type.  Of course, if
8219             # there are more than two values, it's not binary.  But more
8220             # subtle is the test that the default mapping is defined means it
8221             # isn't binary.  This in fact may change in the future if Unicode
8222             # changes the way its data is structured.  But so far, no binary
8223             # properties ever have @missing lines for them, so the default map
8224             # isn't defined for them.  The few properties that are two-valued
8225             # and aren't considered binary have the default map defined
8226             # starting in Unicode 5.0, when the @missing lines appeared; and
8227             # this program has special code to put in a default map for them
8228             # for earlier than 5.0 releases.
8229             if ($type == $ENUM
8230                 || scalar keys %{$unique_maps{$addr}} > 2
8231                 || defined $self->default_map)
8232             {
8233                 my $tables = $self->tables;
8234                 my $count = $self->count;
8235                 if ($verbosity && $count > 500 && $tables/$count > .1) {
8236                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
8237                 }
8238                 $self->set_type($ENUM);
8239             }
8240             else {
8241                 $self->set_type($BINARY);
8242             }
8243         }
8244         undef $unique_maps{$addr};  # Garbage collect
8245         return;
8246     }
8247
8248     sub set_fate {
8249         my $self = shift;
8250         my $fate = shift;
8251         my $reason = shift;  # Ignored unless suppressing
8252         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8253
8254         my $addr = do { no overloading; pack 'J', $self; };
8255         if ($fate == $SUPPRESSED) {
8256             $why_suppressed{$self->complete_name} = $reason;
8257         }
8258
8259         # Each table shares the property's fate, except that MAP_PROXIED
8260         # doesn't affect match tables
8261         $map{$addr}->set_fate($fate, $reason);
8262         if ($fate != $MAP_PROXIED) {
8263             foreach my $table ($map{$addr}, $self->tables) {
8264                 $table->set_fate($fate, $reason);
8265             }
8266         }
8267         return;
8268     }
8269
8270
8271     # Most of the accessors for a property actually apply to its map table.
8272     # Setup up accessor functions for those, referring to %map
8273     for my $sub (qw(
8274                     add_alias
8275                     add_anomalous_entry
8276                     add_comment
8277                     add_conflicting
8278                     add_description
8279                     add_duplicate
8280                     add_note
8281                     aliases
8282                     comment
8283                     complete_name
8284                     containing_range
8285                     count
8286                     default_map
8287                     delete_range
8288                     description
8289                     each_range
8290                     external_name
8291                     fate
8292                     file_path
8293                     format
8294                     initialize
8295                     inverse_list
8296                     is_empty
8297                     name
8298                     note
8299                     perl_extension
8300                     property
8301                     range_count
8302                     ranges
8303                     range_size_1
8304                     reset_each_range
8305                     set_comment
8306                     set_default_map
8307                     set_file_path
8308                     set_final_comment
8309                     _set_format
8310                     set_range_size_1
8311                     set_status
8312                     set_to_output_map
8313                     short_name
8314                     status
8315                     status_info
8316                     to_output_map
8317                     type_of
8318                     value_of
8319                     write
8320                 ))
8321                     # 'property' above is for symmetry, so that one can take
8322                     # the property of a property and get itself, and so don't
8323                     # have to distinguish between properties and tables in
8324                     # calling code
8325     {
8326         no strict "refs";
8327         *$sub = sub {
8328             use strict "refs";
8329             my $self = shift;
8330             no overloading;
8331             return $map{pack 'J', $self}->$sub(@_);
8332         }
8333     }
8334
8335
8336 } # End closure
8337
8338 package main;
8339
8340 sub join_lines($) {
8341     # Returns lines of the input joined together, so that they can be folded
8342     # properly.
8343     # This causes continuation lines to be joined together into one long line
8344     # for folding.  A continuation line is any line that doesn't begin with a
8345     # space or "\b" (the latter is stripped from the output).  This is so
8346     # lines can be be in a HERE document so as to fit nicely in the terminal
8347     # width, but be joined together in one long line, and then folded with
8348     # indents, '#' prefixes, etc, properly handled.
8349     # A blank separates the joined lines except if there is a break; an extra
8350     # blank is inserted after a period ending a line.
8351
8352     # Initialize the return with the first line.
8353     my ($return, @lines) = split "\n", shift;
8354
8355     # If the first line is null, it was an empty line, add the \n back in
8356     $return = "\n" if $return eq "";
8357
8358     # Now join the remainder of the physical lines.
8359     for my $line (@lines) {
8360
8361         # An empty line means wanted a blank line, so add two \n's to get that
8362         # effect, and go to the next line.
8363         if (length $line == 0) {
8364             $return .= "\n\n";
8365             next;
8366         }
8367
8368         # Look at the last character of what we have so far.
8369         my $previous_char = substr($return, -1, 1);
8370
8371         # And at the next char to be output.
8372         my $next_char = substr($line, 0, 1);
8373
8374         if ($previous_char ne "\n") {
8375
8376             # Here didn't end wth a nl.  If the next char a blank or \b, it
8377             # means that here there is a break anyway.  So add a nl to the
8378             # output.
8379             if ($next_char eq " " || $next_char eq "\b") {
8380                 $previous_char = "\n";
8381                 $return .= $previous_char;
8382             }
8383
8384             # Add an extra space after periods.
8385             $return .= " " if $previous_char eq '.';
8386         }
8387
8388         # Here $previous_char is still the latest character to be output.  If
8389         # it isn't a nl, it means that the next line is to be a continuation
8390         # line, with a blank inserted between them.
8391         $return .= " " if $previous_char ne "\n";
8392
8393         # Get rid of any \b
8394         substr($line, 0, 1) = "" if $next_char eq "\b";
8395
8396         # And append this next line.
8397         $return .= $line;
8398     }
8399
8400     return $return;
8401 }
8402
8403 sub simple_fold($;$$$) {
8404     # Returns a string of the input (string or an array of strings) folded
8405     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8406     # a \n
8407     # This is tailored for the kind of text written by this program,
8408     # especially the pod file, which can have very long names with
8409     # underscores in the middle, or words like AbcDefgHij....  We allow
8410     # breaking in the middle of such constructs if the line won't fit
8411     # otherwise.  The break in such cases will come either just after an
8412     # underscore, or just before one of the Capital letters.
8413
8414     local $to_trace = 0 if main::DEBUG;
8415
8416     my $line = shift;
8417     my $prefix = shift;     # Optional string to prepend to each output
8418                             # line
8419     $prefix = "" unless defined $prefix;
8420
8421     my $hanging_indent = shift; # Optional number of spaces to indent
8422                                 # continuation lines
8423     $hanging_indent = 0 unless $hanging_indent;
8424
8425     my $right_margin = shift;   # Optional number of spaces to narrow the
8426                                 # total width by.
8427     $right_margin = 0 unless defined $right_margin;
8428
8429     # Call carp with the 'nofold' option to avoid it from trying to call us
8430     # recursively
8431     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8432
8433     # The space available doesn't include what's automatically prepended
8434     # to each line, or what's reserved on the right.
8435     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8436     # XXX Instead of using the 'nofold' perhaps better to look up the stack
8437
8438     if (DEBUG && $hanging_indent >= $max) {
8439         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
8440         $hanging_indent = 0;
8441     }
8442
8443     # First, split into the current physical lines.
8444     my @line;
8445     if (ref $line) {        # Better be an array, because not bothering to
8446                             # test
8447         foreach my $line (@{$line}) {
8448             push @line, split /\n/, $line;
8449         }
8450     }
8451     else {
8452         @line = split /\n/, $line;
8453     }
8454
8455     #local $to_trace = 1 if main::DEBUG;
8456     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8457
8458     # Look at each current physical line.
8459     for (my $i = 0; $i < @line; $i++) {
8460         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8461         #local $to_trace = 1 if main::DEBUG;
8462         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8463
8464         # Remove prefix, because will be added back anyway, don't want
8465         # doubled prefix
8466         $line[$i] =~ s/^$prefix//;
8467
8468         # Remove trailing space
8469         $line[$i] =~ s/\s+\Z//;
8470
8471         # If the line is too long, fold it.
8472         if (length $line[$i] > $max) {
8473             my $remainder;
8474
8475             # Here needs to fold.  Save the leading space in the line for
8476             # later.
8477             $line[$i] =~ /^ ( \s* )/x;
8478             my $leading_space = $1;
8479             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8480
8481             # If character at final permissible position is white space,
8482             # fold there, which will delete that white space
8483             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8484                 $remainder = substr($line[$i], $max);
8485                 $line[$i] = substr($line[$i], 0, $max - 1);
8486             }
8487             else {
8488
8489                 # Otherwise fold at an acceptable break char closest to
8490                 # the max length.  Look at just the maximal initial
8491                 # segment of the line
8492                 my $segment = substr($line[$i], 0, $max - 1);
8493                 if ($segment =~
8494                     /^ ( .{$hanging_indent}   # Don't look before the
8495                                               #  indent.
8496                         \ *                   # Don't look in leading
8497                                               #  blanks past the indent
8498                             [^ ] .*           # Find the right-most
8499                         (?:                   #  acceptable break:
8500                             [ \s = ]          # space or equal
8501                             | - (?! [.0-9] )  # or non-unary minus.
8502                         )                     # $1 includes the character
8503                     )/x)
8504                 {
8505                     # Split into the initial part that fits, and remaining
8506                     # part of the input
8507                     $remainder = substr($line[$i], length $1);
8508                     $line[$i] = $1;
8509                     trace $line[$i] if DEBUG && $to_trace;
8510                     trace $remainder if DEBUG && $to_trace;
8511                 }
8512
8513                 # If didn't find a good breaking spot, see if there is a
8514                 # not-so-good breaking spot.  These are just after
8515                 # underscores or where the case changes from lower to
8516                 # upper.  Use \a as a soft hyphen, but give up
8517                 # and don't break the line if there is actually a \a
8518                 # already in the input.  We use an ascii character for the
8519                 # soft-hyphen to avoid any attempt by miniperl to try to
8520                 # access the files that this program is creating.
8521                 elsif ($segment !~ /\a/
8522                        && ($segment =~ s/_/_\a/g
8523                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8524                 {
8525                     # Here were able to find at least one place to insert
8526                     # our substitute soft hyphen.  Find the right-most one
8527                     # and replace it by a real hyphen.
8528                     trace $segment if DEBUG && $to_trace;
8529                     substr($segment,
8530                             rindex($segment, "\a"),
8531                             1) = '-';
8532
8533                     # Then remove the soft hyphen substitutes.
8534                     $segment =~ s/\a//g;
8535                     trace $segment if DEBUG && $to_trace;
8536
8537                     # And split into the initial part that fits, and
8538                     # remainder of the line
8539                     my $pos = rindex($segment, '-');
8540                     $remainder = substr($line[$i], $pos);
8541                     trace $remainder if DEBUG && $to_trace;
8542                     $line[$i] = substr($segment, 0, $pos + 1);
8543                 }
8544             }
8545
8546             # Here we know if we can fold or not.  If we can, $remainder
8547             # is what remains to be processed in the next iteration.
8548             if (defined $remainder) {
8549                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8550
8551                 # Insert the folded remainder of the line as a new element
8552                 # of the array.  (It may still be too long, but we will
8553                 # deal with that next time through the loop.)  Omit any
8554                 # leading space in the remainder.
8555                 $remainder =~ s/^\s+//;
8556                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8557
8558                 # But then indent by whichever is larger of:
8559                 # 1) the leading space on the input line;
8560                 # 2) the hanging indent.
8561                 # This preserves indentation in the original line.
8562                 my $lead = ($leading_space)
8563                             ? length $leading_space
8564                             : $hanging_indent;
8565                 $lead = max($lead, $hanging_indent);
8566                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8567             }
8568         }
8569
8570         # Ready to output the line. Get rid of any trailing space
8571         # And prefix by the required $prefix passed in.
8572         $line[$i] =~ s/\s+$//;
8573         $line[$i] = "$prefix$line[$i]\n";
8574     } # End of looping through all the lines.
8575
8576     return join "", @line;
8577 }
8578
8579 sub property_ref {  # Returns a reference to a property object.
8580     return Property::property_ref(@_);
8581 }
8582
8583 sub force_unlink ($) {
8584     my $filename = shift;
8585     return unless file_exists($filename);
8586     return if CORE::unlink($filename);
8587
8588     # We might need write permission
8589     chmod 0777, $filename;
8590     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
8591     return;
8592 }
8593
8594 sub write ($$@) {
8595     # Given a filename and references to arrays of lines, write the lines of
8596     # each array to the file
8597     # Filename can be given as an arrayref of directory names
8598
8599     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8600
8601     my $file  = shift;
8602     my $use_utf8 = shift;
8603
8604     # Get into a single string if an array, and get rid of, in Unix terms, any
8605     # leading '.'
8606     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8607     $file = File::Spec->canonpath($file);
8608
8609     # If has directories, make sure that they all exist
8610     (undef, my $directories, undef) = File::Spec->splitpath($file);
8611     File::Path::mkpath($directories) if $directories && ! -d $directories;
8612
8613     push @files_actually_output, $file;
8614
8615     force_unlink ($file);
8616
8617     my $OUT;
8618     if (not open $OUT, ">", $file) {
8619         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
8620         return;
8621     }
8622
8623     binmode $OUT, ":utf8" if $use_utf8;
8624
8625     while (defined (my $lines_ref = shift)) {
8626         unless (@$lines_ref) {
8627             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8628         }
8629
8630         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8631     }
8632     close $OUT or die Carp::my_carp("close '$file' failed: $!");
8633
8634     print "$file written.\n" if $verbosity >= $VERBOSE;
8635
8636     return;
8637 }
8638
8639
8640 sub Standardize($) {
8641     # This converts the input name string into a standardized equivalent to
8642     # use internally.
8643
8644     my $name = shift;
8645     unless (defined $name) {
8646       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
8647       return;
8648     }
8649
8650     # Remove any leading or trailing white space
8651     $name =~ s/^\s+//g;
8652     $name =~ s/\s+$//g;
8653
8654     # Convert interior white space and hyphens into underscores.
8655     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8656
8657     # Capitalize the letter following an underscore, and convert a sequence of
8658     # multiple underscores to a single one
8659     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8660
8661     # And capitalize the first letter, but not for the special cjk ones.
8662     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8663     return $name;
8664 }
8665
8666 sub standardize ($) {
8667     # Returns a lower-cased standardized name, without underscores.  This form
8668     # is chosen so that it can distinguish between any real versus superficial
8669     # Unicode name differences.  It relies on the fact that Unicode doesn't
8670     # have interior underscores, white space, nor dashes in any
8671     # stricter-matched name.  It should not be used on Unicode code point
8672     # names (the Name property), as they mostly, but not always follow these
8673     # rules.
8674
8675     my $name = Standardize(shift);
8676     return if !defined $name;
8677
8678     $name =~ s/ (?<= .) _ (?= . ) //xg;
8679     return lc $name;
8680 }
8681
8682 sub utf8_heavy_name ($$) {
8683     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
8684     # perhaps this function should be placed somewhere, like Heavy.pl so that
8685     # utf8_heavy can use it directly without duplicating code that can get
8686     # out-of sync.
8687
8688     my $table = shift;
8689     my $alias = shift;
8690     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8691
8692     my $property = $table->property;
8693     $property = ($property == $perl)
8694                 ? ""                # 'perl' is never explicitly stated
8695                 : standardize($property->name) . '=';
8696     if ($alias->loose_match) {
8697         return $property . standardize($alias->name);
8698     }
8699     else {
8700         return lc ($property . $alias->name);
8701     }
8702
8703     return;
8704 }
8705
8706 {   # Closure
8707
8708     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8709     my %already_output;
8710
8711     $main::simple_dumper_nesting = 0;
8712
8713     sub simple_dumper {
8714         # Like Simple Data::Dumper. Good enough for our needs. We can't use
8715         # the real thing as we have to run under miniperl.
8716
8717         # It is designed so that on input it is at the beginning of a line,
8718         # and the final thing output in any call is a trailing ",\n".
8719
8720         my $item = shift;
8721         my $indent = shift;
8722         $indent = "" if ! $debugging_build || ! defined $indent;
8723
8724         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8725
8726         # nesting level is localized, so that as the call stack pops, it goes
8727         # back to the prior value.
8728         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8729         undef %already_output if $main::simple_dumper_nesting == 0;
8730         $main::simple_dumper_nesting++;
8731         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8732
8733         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8734
8735         # Determine the indent for recursive calls.
8736         my $next_indent = $indent . $indent_increment;
8737
8738         my $output;
8739         if (! ref $item) {
8740
8741             # Dump of scalar: just output it in quotes if not a number.  To do
8742             # so we must escape certain characters, and therefore need to
8743             # operate on a copy to avoid changing the original
8744             my $copy = $item;
8745             $copy = $UNDEF unless defined $copy;
8746
8747             # Quote non-integers (integers also have optional leading '-')
8748             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8749
8750                 # Escape apostrophe and backslash
8751                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8752                 $copy = "'$copy'";
8753             }
8754             $output = "$indent$copy,\n";
8755         }
8756         else {
8757
8758             # Keep track of cycles in the input, and refuse to infinitely loop
8759             my $addr = do { no overloading; pack 'J', $item; };
8760             if (defined $already_output{$addr}) {
8761                 return "${indent}ALREADY OUTPUT: $item\n";
8762             }
8763             $already_output{$addr} = $item;
8764
8765             if (ref $item eq 'ARRAY') {
8766                 my $using_brackets;
8767                 $output = $indent;
8768                 if ($main::simple_dumper_nesting > 1) {
8769                     $output .= '[';
8770                     $using_brackets = 1;
8771                 }
8772                 else {
8773                     $using_brackets = 0;
8774                 }
8775
8776                 # If the array is empty, put the closing bracket on the same
8777                 # line.  Otherwise, recursively add each array element
8778                 if (@$item == 0) {
8779                     $output .= " ";
8780                 }
8781                 else {
8782                     $output .= "\n";
8783                     for (my $i = 0; $i < @$item; $i++) {
8784
8785                         # Indent array elements one level
8786                         $output .= &simple_dumper($item->[$i], $next_indent);
8787                         next if ! $debugging_build;
8788                         $output =~ s/\n$//;      # Remove any trailing nl so
8789                         $output .= " # [$i]\n";  # as to add a comment giving
8790                                                  # the array index
8791                     }
8792                     $output .= $indent;     # Indent closing ']' to orig level
8793                 }
8794                 $output .= ']' if $using_brackets;
8795                 $output .= ",\n";
8796             }
8797             elsif (ref $item eq 'HASH') {
8798                 my $is_first_line;
8799                 my $using_braces;
8800                 my $body_indent;
8801
8802                 # No surrounding braces at top level
8803                 $output .= $indent;
8804                 if ($main::simple_dumper_nesting > 1) {
8805                     $output .= "{\n";
8806                     $is_first_line = 0;
8807                     $body_indent = $next_indent;
8808                     $next_indent .= $indent_increment;
8809                     $using_braces = 1;
8810                 }
8811                 else {
8812                     $is_first_line = 1;
8813                     $body_indent = $indent;
8814                     $using_braces = 0;
8815                 }
8816
8817                 # Output hashes sorted alphabetically instead of apparently
8818                 # random.  Use caseless alphabetic sort
8819                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8820                 {
8821                     if ($is_first_line) {
8822                         $is_first_line = 0;
8823                     }
8824                     else {
8825                         $output .= "$body_indent";
8826                     }
8827
8828                     # The key must be a scalar, but this recursive call quotes
8829                     # it
8830                     $output .= &simple_dumper($key);
8831
8832                     # And change the trailing comma and nl to the hash fat
8833                     # comma for clarity, and so the value can be on the same
8834                     # line
8835                     $output =~ s/,\n$/ => /;
8836
8837                     # Recursively call to get the value's dump.
8838                     my $next = &simple_dumper($item->{$key}, $next_indent);
8839
8840                     # If the value is all on one line, remove its indent, so
8841                     # will follow the => immediately.  If it takes more than
8842                     # one line, start it on a new line.
8843                     if ($next !~ /\n.*\n/) {
8844                         $next =~ s/^ *//;
8845                     }
8846                     else {
8847                         $output .= "\n";
8848                     }
8849                     $output .= $next;
8850                 }
8851
8852                 $output .= "$indent},\n" if $using_braces;
8853             }
8854             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8855                 $output = $indent . ref($item) . "\n";
8856                 # XXX see if blessed
8857             }
8858             elsif ($item->can('dump')) {
8859
8860                 # By convention in this program, objects furnish a 'dump'
8861                 # method.  Since not doing any output at this level, just pass
8862                 # on the input indent
8863                 $output = $item->dump($indent);
8864             }
8865             else {
8866                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
8867             }
8868         }
8869         return $output;
8870     }
8871 }
8872
8873 sub dump_inside_out {
8874     # Dump inside-out hashes in an object's state by converting them to a
8875     # regular hash and then calling simple_dumper on that.
8876
8877     my $object = shift;
8878     my $fields_ref = shift;
8879     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8880
8881     my $addr = do { no overloading; pack 'J', $object; };
8882
8883     my %hash;
8884     foreach my $key (keys %$fields_ref) {
8885         $hash{$key} = $fields_ref->{$key}{$addr};
8886     }
8887
8888     return simple_dumper(\%hash, @_);
8889 }
8890
8891 sub _operator_dot {
8892     # Overloaded '.' method that is common to all packages.  It uses the
8893     # package's stringify method.
8894
8895     my $self = shift;
8896     my $other = shift;
8897     my $reversed = shift;
8898     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8899
8900     $other = "" unless defined $other;
8901
8902     foreach my $which (\$self, \$other) {
8903         next unless ref $$which;
8904         if ($$which->can('_operator_stringify')) {
8905             $$which = $$which->_operator_stringify;
8906         }
8907         else {
8908             my $ref = ref $$which;
8909             my $addr = do { no overloading; pack 'J', $$which; };
8910             $$which = "$ref ($addr)";
8911         }
8912     }
8913     return ($reversed)
8914             ? "$other$self"
8915             : "$self$other";
8916 }
8917
8918 sub _operator_dot_equal {
8919     # Overloaded '.=' method that is common to all packages.
8920
8921     my $self = shift;
8922     my $other = shift;
8923     my $reversed = shift;
8924     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8925
8926     $other = "" unless defined $other;
8927
8928     if ($reversed) {
8929         return $other .= "$self";
8930     }
8931     else {
8932         return "$self" . "$other";
8933     }
8934 }
8935
8936 sub _operator_equal {
8937     # Generic overloaded '==' routine.  To be equal, they must be the exact
8938     # same object
8939
8940     my $self = shift;
8941     my $other = shift;
8942
8943     return 0 unless defined $other;
8944     return 0 unless ref $other;
8945     no overloading;
8946     return $self == $other;
8947 }
8948
8949 sub _operator_not_equal {
8950     my $self = shift;
8951     my $other = shift;
8952
8953     return ! _operator_equal($self, $other);
8954 }
8955
8956 sub process_PropertyAliases($) {
8957     # This reads in the PropertyAliases.txt file, which contains almost all
8958     # the character properties in Unicode and their equivalent aliases:
8959     # scf       ; Simple_Case_Folding         ; sfc
8960     #
8961     # Field 0 is the preferred short name for the property.
8962     # Field 1 is the full name.
8963     # Any succeeding ones are other accepted names.
8964
8965     my $file= shift;
8966     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8967
8968     # This whole file was non-existent in early releases, so use our own
8969     # internal one.
8970     $file->insert_lines(get_old_property_aliases())
8971                                                 if ! -e 'PropertyAliases.txt';
8972
8973     # Add any cjk properties that may have been defined.
8974     $file->insert_lines(@cjk_properties);
8975
8976     while ($file->next_line) {
8977
8978         my @data = split /\s*;\s*/;
8979
8980         my $full = $data[1];
8981
8982         my $this = Property->new($data[0], Full_Name => $full);
8983
8984         # Start looking for more aliases after these two.
8985         for my $i (2 .. @data - 1) {
8986             $this->add_alias($data[$i]);
8987         }
8988
8989     }
8990
8991     my $scf = property_ref("Simple_Case_Folding");
8992     $scf->add_alias("scf");
8993     $scf->add_alias("sfc");
8994
8995     return;
8996 }
8997
8998 sub finish_property_setup {
8999     # Finishes setting up after PropertyAliases.
9000
9001     my $file = shift;
9002     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9003
9004     # This entry was missing from this file in earlier Unicode versions
9005     if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9006         Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9007     }
9008
9009     # These two properties must be defined in all releases so we can generate
9010     # the tables from them to make regex \X work, but suppress their output so
9011     # aren't application visible prior to releases where they should be
9012     if (! defined property_ref('GCB')) {
9013         Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9014                       Fate => $PLACEHOLDER);
9015     }
9016     if (! defined property_ref('hst')) {
9017         Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9018                       Fate => $PLACEHOLDER);
9019     }
9020
9021     # These are used so much, that we set globals for them.
9022     $gc = property_ref('General_Category');
9023     $block = property_ref('Block');
9024     $script = property_ref('Script');
9025
9026     # Perl adds this alias.
9027     $gc->add_alias('Category');
9028
9029     # Unicode::Normalize expects this file with this name and directory.
9030     my $ccc = property_ref('Canonical_Combining_Class');
9031     if (defined $ccc) {
9032         $ccc->set_file('CombiningClass');
9033         $ccc->set_directory(File::Spec->curdir());
9034     }
9035
9036     # These two properties aren't actually used in the core, but unfortunately
9037     # the names just above that are in the core interfere with these, so
9038     # choose different names.  These aren't a problem unless the map tables
9039     # for these files get written out.
9040     my $lowercase = property_ref('Lowercase');
9041     $lowercase->set_file('IsLower') if defined $lowercase;
9042     my $uppercase = property_ref('Uppercase');
9043     $uppercase->set_file('IsUpper') if defined $uppercase;
9044
9045     # Set up the hard-coded default mappings, but only on properties defined
9046     # for this release
9047     foreach my $property (keys %default_mapping) {
9048         my $property_object = property_ref($property);
9049         next if ! defined $property_object;
9050         my $default_map = $default_mapping{$property};
9051         $property_object->set_default_map($default_map);
9052
9053         # A map of <code point> implies the property is string.
9054         if ($property_object->type == $UNKNOWN
9055             && $default_map eq $CODE_POINT)
9056         {
9057             $property_object->set_type($STRING);
9058         }
9059     }
9060
9061     # The following use the Multi_Default class to create objects for
9062     # defaults.
9063
9064     # Bidi class has a complicated default, but the derived file takes care of
9065     # the complications, leaving just 'L'.
9066     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9067         property_ref('Bidi_Class')->set_default_map('L');
9068     }
9069     else {
9070         my $default;
9071
9072         # The derived file was introduced in 3.1.1.  The values below are
9073         # taken from table 3-8, TUS 3.0
9074         my $default_R =
9075             'my $default = Range_List->new;
9076              $default->add_range(0x0590, 0x05FF);
9077              $default->add_range(0xFB1D, 0xFB4F);'
9078         ;
9079
9080         # The defaults apply only to unassigned characters
9081         $default_R .= '$gc->table("Unassigned") & $default;';
9082
9083         if ($v_version lt v3.0.0) {
9084             $default = Multi_Default->new(R => $default_R, 'L');
9085         }
9086         else {
9087
9088             # AL apparently not introduced until 3.0:  TUS 2.x references are
9089             # not on-line to check it out
9090             my $default_AL =
9091                 'my $default = Range_List->new;
9092                  $default->add_range(0x0600, 0x07BF);
9093                  $default->add_range(0xFB50, 0xFDFF);
9094                  $default->add_range(0xFE70, 0xFEFF);'
9095             ;
9096
9097             # Non-character code points introduced in this release; aren't AL
9098             if ($v_version ge 3.1.0) {
9099                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9100             }
9101             $default_AL .= '$gc->table("Unassigned") & $default';
9102             $default = Multi_Default->new(AL => $default_AL,
9103                                           R => $default_R,
9104                                           'L');
9105         }
9106         property_ref('Bidi_Class')->set_default_map($default);
9107     }
9108
9109     # Joining type has a complicated default, but the derived file takes care
9110     # of the complications, leaving just 'U' (or Non_Joining), except the file
9111     # is bad in 3.1.0
9112     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9113         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9114             property_ref('Joining_Type')->set_default_map('Non_Joining');
9115         }
9116         else {
9117
9118             # Otherwise, there are not one, but two possibilities for the
9119             # missing defaults: T and U.
9120             # The missing defaults that evaluate to T are given by:
9121             # T = Mn + Cf - ZWNJ - ZWJ
9122             # where Mn and Cf are the general category values. In other words,
9123             # any non-spacing mark or any format control character, except
9124             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9125             # WIDTH JOINER (joining type C).
9126             my $default = Multi_Default->new(
9127                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9128                'Non_Joining');
9129             property_ref('Joining_Type')->set_default_map($default);
9130         }
9131     }
9132
9133     # Line break has a complicated default in early releases. It is 'Unknown'
9134     # for non-assigned code points; 'AL' for assigned.
9135     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9136         my $lb = property_ref('Line_Break');
9137         if ($v_version gt 3.2.0) {
9138             $lb->set_default_map('Unknown');
9139         }
9140         else {
9141             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9142                                               'AL');
9143             $lb->set_default_map($default);
9144         }
9145
9146         # If has the URS property, make sure that the standard aliases are in
9147         # it, since not in the input tables in some versions.
9148         my $urs = property_ref('Unicode_Radical_Stroke');
9149         if (defined $urs) {
9150             $urs->add_alias('cjkRSUnicode');
9151             $urs->add_alias('kRSUnicode');
9152         }
9153     }
9154
9155     # For backwards compatibility with applications that may read the mapping
9156     # file directly (it was documented in 5.12 and 5.14 as being thusly
9157     # usable), keep it from being adjusted.  (range_size_1 is
9158     # used to force the traditional format.)
9159     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9160         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9161         $nfkc_cf->set_range_size_1(1);
9162     }
9163     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9164         $bmg->set_to_output_map($EXTERNAL_MAP);
9165         $bmg->set_range_size_1(1);
9166     }
9167
9168     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9169
9170     return;
9171 }
9172
9173 sub get_old_property_aliases() {
9174     # Returns what would be in PropertyAliases.txt if it existed in very old
9175     # versions of Unicode.  It was derived from the one in 3.2, and pared
9176     # down based on the data that was actually in the older releases.
9177     # An attempt was made to use the existence of files to mean inclusion or
9178     # not of various aliases, but if this was not sufficient, using version
9179     # numbers was resorted to.
9180
9181     my @return;
9182
9183     # These are to be used in all versions (though some are constructed by
9184     # this program if missing)
9185     push @return, split /\n/, <<'END';
9186 bc        ; Bidi_Class
9187 Bidi_M    ; Bidi_Mirrored
9188 cf        ; Case_Folding
9189 ccc       ; Canonical_Combining_Class
9190 dm        ; Decomposition_Mapping
9191 dt        ; Decomposition_Type
9192 gc        ; General_Category
9193 isc       ; ISO_Comment
9194 lc        ; Lowercase_Mapping
9195 na        ; Name
9196 na1       ; Unicode_1_Name
9197 nt        ; Numeric_Type
9198 nv        ; Numeric_Value
9199 scf       ; Simple_Case_Folding
9200 slc       ; Simple_Lowercase_Mapping
9201 stc       ; Simple_Titlecase_Mapping
9202 suc       ; Simple_Uppercase_Mapping
9203 tc        ; Titlecase_Mapping
9204 uc        ; Uppercase_Mapping
9205 END
9206
9207     if (-e 'Blocks.txt') {
9208         push @return, "blk       ; Block\n";
9209     }
9210     if (-e 'ArabicShaping.txt') {
9211         push @return, split /\n/, <<'END';
9212 jg        ; Joining_Group
9213 jt        ; Joining_Type
9214 END
9215     }
9216     if (-e 'PropList.txt') {
9217
9218         # This first set is in the original old-style proplist.
9219         push @return, split /\n/, <<'END';
9220 Bidi_C    ; Bidi_Control
9221 Dash      ; Dash
9222 Dia       ; Diacritic
9223 Ext       ; Extender
9224 Hex       ; Hex_Digit
9225 Hyphen    ; Hyphen
9226 IDC       ; ID_Continue
9227 Ideo      ; Ideographic
9228 Join_C    ; Join_Control
9229 Math      ; Math
9230 QMark     ; Quotation_Mark
9231 Term      ; Terminal_Punctuation
9232 WSpace    ; White_Space
9233 END
9234         # The next sets were added later
9235         if ($v_version ge v3.0.0) {
9236             push @return, split /\n/, <<'END';
9237 Upper     ; Uppercase
9238 Lower     ; Lowercase
9239 END
9240         }
9241         if ($v_version ge v3.0.1) {
9242             push @return, split /\n/, <<'END';
9243 NChar     ; Noncharacter_Code_Point
9244 END
9245         }
9246         # The next sets were added in the new-style
9247         if ($v_version ge v3.1.0) {
9248             push @return, split /\n/, <<'END';
9249 OAlpha    ; Other_Alphabetic
9250 OLower    ; Other_Lowercase
9251 OMath     ; Other_Math
9252 OUpper    ; Other_Uppercase
9253 END
9254         }
9255         if ($v_version ge v3.1.1) {
9256             push @return, "AHex      ; ASCII_Hex_Digit\n";
9257         }
9258     }
9259     if (-e 'EastAsianWidth.txt') {
9260         push @return, "ea        ; East_Asian_Width\n";
9261     }
9262     if (-e 'CompositionExclusions.txt') {
9263         push @return, "CE        ; Composition_Exclusion\n";
9264     }
9265     if (-e 'LineBreak.txt') {
9266         push @return, "lb        ; Line_Break\n";
9267     }
9268     if (-e 'BidiMirroring.txt') {
9269         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
9270     }
9271     if (-e 'Scripts.txt') {
9272         push @return, "sc        ; Script\n";
9273     }
9274     if (-e 'DNormalizationProps.txt') {
9275         push @return, split /\n/, <<'END';
9276 Comp_Ex   ; Full_Composition_Exclusion
9277 FC_NFKC   ; FC_NFKC_Closure
9278 NFC_QC    ; NFC_Quick_Check
9279 NFD_QC    ; NFD_Quick_Check
9280 NFKC_QC   ; NFKC_Quick_Check
9281 NFKD_QC   ; NFKD_Quick_Check
9282 XO_NFC    ; Expands_On_NFC
9283 XO_NFD    ; Expands_On_NFD
9284 XO_NFKC   ; Expands_On_NFKC
9285 XO_NFKD   ; Expands_On_NFKD
9286 END
9287     }
9288     if (-e 'DCoreProperties.txt') {
9289         push @return, split /\n/, <<'END';
9290 Alpha     ; Alphabetic
9291 IDS       ; ID_Start
9292 XIDC      ; XID_Continue
9293 XIDS      ; XID_Start
9294 END
9295         # These can also appear in some versions of PropList.txt
9296         push @return, "Lower     ; Lowercase\n"
9297                                     unless grep { $_ =~ /^Lower\b/} @return;
9298         push @return, "Upper     ; Uppercase\n"
9299                                     unless grep { $_ =~ /^Upper\b/} @return;
9300     }
9301
9302     # This flag requires the DAge.txt file to be copied into the directory.
9303     if (DEBUG && $compare_versions) {
9304         push @return, 'age       ; Age';
9305     }
9306
9307     return @return;
9308 }
9309
9310 sub process_PropValueAliases {
9311     # This file contains values that properties look like:
9312     # bc ; AL        ; Arabic_Letter
9313     # blk; n/a       ; Greek_And_Coptic                 ; Greek
9314     #
9315     # Field 0 is the property.
9316     # Field 1 is the short name of a property value or 'n/a' if no
9317     #                short name exists;
9318     # Field 2 is the full property value name;
9319     # Any other fields are more synonyms for the property value.
9320     # Purely numeric property values are omitted from the file; as are some
9321     # others, fewer and fewer in later releases
9322
9323     # Entries for the ccc property have an extra field before the
9324     # abbreviation:
9325     # ccc;   0; NR   ; Not_Reordered
9326     # It is the numeric value that the names are synonyms for.
9327
9328     # There are comment entries for values missing from this file:
9329     # # @missing: 0000..10FFFF; ISO_Comment; <none>
9330     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9331
9332     my $file= shift;
9333     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9334
9335     # This whole file was non-existent in early releases, so use our own
9336     # internal one if necessary.
9337     if (! -e 'PropValueAliases.txt') {
9338         $file->insert_lines(get_old_property_value_aliases());
9339     }
9340
9341     if ($v_version lt 4.0.0) {
9342         $file->insert_lines(split /\n/, <<'END'
9343 hst; L                                ; Leading_Jamo
9344 hst; LV                               ; LV_Syllable
9345 hst; LVT                              ; LVT_Syllable
9346 hst; NA                               ; Not_Applicable
9347 hst; T                                ; Trailing_Jamo
9348 hst; V                                ; Vowel_Jamo
9349 END
9350         );
9351     }
9352     if ($v_version lt 4.1.0) {
9353         $file->insert_lines(split /\n/, <<'END'
9354 GCB; CN                               ; Control
9355 GCB; CR                               ; CR
9356 GCB; EX                               ; Extend
9357 GCB; L                                ; L
9358 GCB; LF                               ; LF
9359 GCB; LV                               ; LV
9360 GCB; LVT                              ; LVT
9361 GCB; T                                ; T
9362 GCB; V                                ; V
9363 GCB; XX                               ; Other
9364 END
9365         );
9366     }
9367
9368
9369     # Add any explicit cjk values
9370     $file->insert_lines(@cjk_property_values);
9371
9372     # This line is used only for testing the code that checks for name
9373     # conflicts.  There is a script Inherited, and when this line is executed
9374     # it causes there to be a name conflict with the 'Inherited' that this
9375     # program generates for this block property value
9376     #$file->insert_lines('blk; n/a; Herited');
9377
9378
9379     # Process each line of the file ...
9380     while ($file->next_line) {
9381
9382         # Fix typo in input file
9383         s/CCC133/CCC132/g if $v_version eq v6.1.0;
9384
9385         my ($property, @data) = split /\s*;\s*/;
9386
9387         # The ccc property has an extra field at the beginning, which is the
9388         # numeric value.  Move it to be after the other two, mnemonic, fields,
9389         # so that those will be used as the property value's names, and the
9390         # number will be an extra alias.  (Rightmost splice removes field 1-2,
9391         # returning them in a slice; left splice inserts that before anything,
9392         # thus shifting the former field 0 to after them.)
9393         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9394
9395         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
9396         # there is no short name, use the full one in element 1
9397         if ($data[0] eq "n/a") {
9398             $data[0] = $data[1];
9399         }
9400         elsif ($data[0] ne $data[1]
9401                && standardize($data[0]) eq standardize($data[1])
9402                && $data[1] !~ /[[:upper:]]/)
9403         {
9404             # Also, there is a bug in the file in which "n/a" is omitted, and
9405             # the two fields are identical except for case, and the full name
9406             # is all lower case.  Copy the "short" name unto the full one to
9407             # give it some upper case.
9408
9409             $data[1] = $data[0];
9410         }
9411
9412         # Earlier releases had the pseudo property 'qc' that should expand to
9413         # the ones that replace it below.
9414         if ($property eq 'qc') {
9415             if (lc $data[0] eq 'y') {
9416                 $file->insert_lines('NFC_QC; Y      ; Yes',
9417                                     'NFD_QC; Y      ; Yes',
9418                                     'NFKC_QC; Y     ; Yes',
9419                                     'NFKD_QC; Y     ; Yes',
9420                                     );
9421             }
9422             elsif (lc $data[0] eq 'n') {
9423                 $file->insert_lines('NFC_QC; N      ; No',
9424                                     'NFD_QC; N      ; No',
9425                                     'NFKC_QC; N     ; No',
9426                                     'NFKD_QC; N     ; No',
9427                                     );
9428             }
9429             elsif (lc $data[0] eq 'm') {
9430                 $file->insert_lines('NFC_QC; M      ; Maybe',
9431                                     'NFKC_QC; M     ; Maybe',
9432                                     );
9433             }
9434             else {
9435                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9436             }
9437             next;
9438         }
9439
9440         # The first field is the short name, 2nd is the full one.
9441         my $property_object = property_ref($property);
9442         my $table = $property_object->add_match_table($data[0],
9443                                                 Full_Name => $data[1]);
9444
9445         # Start looking for more aliases after these two.
9446         for my $i (2 .. @data - 1) {
9447             $table->add_alias($data[$i]);
9448         }
9449     } # End of looping through the file
9450
9451     # As noted in the comments early in the program, it generates tables for
9452     # the default values for all releases, even those for which the concept
9453     # didn't exist at the time.  Here we add those if missing.
9454     my $age = property_ref('age');
9455     if (defined $age && ! defined $age->table('Unassigned')) {
9456         $age->add_match_table('Unassigned');
9457     }
9458     $block->add_match_table('No_Block') if -e 'Blocks.txt'
9459                                     && ! defined $block->table('No_Block');
9460
9461
9462     # Now set the default mappings of the properties from the file.  This is
9463     # done after the loop because a number of properties have only @missings
9464     # entries in the file, and may not show up until the end.
9465     my @defaults = $file->get_missings;
9466     foreach my $default_ref (@defaults) {
9467         my $default = $default_ref->[0];
9468         my $property = property_ref($default_ref->[1]);
9469         $property->set_default_map($default);
9470     }
9471     return;
9472 }
9473
9474 sub get_old_property_value_aliases () {
9475     # Returns what would be in PropValueAliases.txt if it existed in very old
9476     # versions of Unicode.  It was derived from the one in 3.2, and pared
9477     # down.  An attempt was made to use the existence of files to mean
9478     # inclusion or not of various aliases, but if this was not sufficient,
9479     # using version numbers was resorted to.
9480
9481     my @return = split /\n/, <<'END';
9482 bc ; AN        ; Arabic_Number
9483 bc ; B         ; Paragraph_Separator
9484 bc ; CS        ; Common_Separator
9485 bc ; EN        ; European_Number
9486 bc ; ES        ; European_Separator
9487 bc ; ET        ; European_Terminator
9488 bc ; L         ; Left_To_Right
9489 bc ; ON        ; Other_Neutral
9490 bc ; R         ; Right_To_Left
9491 bc ; WS        ; White_Space
9492
9493 Bidi_M; N; No; F; False
9494 Bidi_M; Y; Yes; T; True
9495
9496 # The standard combining classes are very much different in v1, so only use
9497 # ones that look right (not checked thoroughly)
9498 ccc;   0; NR   ; Not_Reordered
9499 ccc;   1; OV   ; Overlay
9500 ccc;   7; NK   ; Nukta
9501 ccc;   8; KV   ; Kana_Voicing
9502 ccc;   9; VR   ; Virama
9503 ccc; 202; ATBL ; Attached_Below_Left
9504 ccc; 216; ATAR ; Attached_Above_Right
9505 ccc; 218; BL   ; Below_Left
9506 ccc; 220; B    ; Below
9507 ccc; 222; BR   ; Below_Right
9508 ccc; 224; L    ; Left
9509 ccc; 228; AL   ; Above_Left
9510 ccc; 230; A    ; Above
9511 ccc; 232; AR   ; Above_Right
9512 ccc; 234; DA   ; Double_Above
9513
9514 dt ; can       ; canonical
9515 dt ; enc       ; circle
9516 dt ; fin       ; final
9517 dt ; font      ; font
9518 dt ; fra       ; fraction
9519 dt ; init      ; initial
9520 dt ; iso       ; isolated
9521 dt ; med       ; medial
9522 dt ; n/a       ; none
9523 dt ; nb        ; noBreak
9524 dt ; sqr       ; square
9525 dt ; sub       ; sub
9526 dt ; sup       ; super
9527
9528 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
9529 gc ; Cc        ; Control
9530 gc ; Cn        ; Unassigned
9531 gc ; Co        ; Private_Use
9532 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
9533 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
9534 gc ; Ll        ; Lowercase_Letter
9535 gc ; Lm        ; Modifier_Letter
9536 gc ; Lo        ; Other_Letter
9537 gc ; Lu        ; Uppercase_Letter
9538 gc ; M         ; Mark                             # Mc | Me | Mn
9539 gc ; Mc        ; Spacing_Mark
9540 gc ; Mn        ; Nonspacing_Mark
9541 gc ; N         ; Number                           # Nd | Nl | No
9542 gc ; Nd        ; Decimal_Number
9543 gc ; No        ; Other_Number
9544 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
9545 gc ; Pd        ; Dash_Punctuation
9546 gc ; Pe        ; Close_Punctuation
9547 gc ; Po        ; Other_Punctuation
9548 gc ; Ps        ; Open_Punctuation
9549 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
9550 gc ; Sc        ; Currency_Symbol
9551 gc ; Sm        ; Math_Symbol
9552 gc ; So        ; Other_Symbol
9553 gc ; Z         ; Separator                        # Zl | Zp | Zs
9554 gc ; Zl        ; Line_Separator
9555 gc ; Zp        ; Paragraph_Separator
9556 gc ; Zs        ; Space_Separator
9557
9558 nt ; de        ; Decimal
9559 nt ; di        ; Digit
9560 nt ; n/a       ; None
9561 nt ; nu        ; Numeric
9562 END
9563
9564     if (-e 'ArabicShaping.txt') {
9565         push @return, split /\n/, <<'END';
9566 jg ; n/a       ; AIN
9567 jg ; n/a       ; ALEF
9568 jg ; n/a       ; DAL
9569 jg ; n/a       ; GAF
9570 jg ; n/a       ; LAM
9571 jg ; n/a       ; MEEM
9572 jg ; n/a       ; NO_JOINING_GROUP
9573 jg ; n/a       ; NOON
9574 jg ; n/a       ; QAF
9575 jg ; n/a       ; SAD
9576 jg ; n/a       ; SEEN
9577 jg ; n/a       ; TAH
9578 jg ; n/a       ; WAW
9579
9580 jt ; C         ; Join_Causing
9581 jt ; D         ; Dual_Joining
9582 jt ; L         ; Left_Joining
9583 jt ; R         ; Right_Joining
9584 jt ; U         ; Non_Joining
9585 jt ; T         ; Transparent
9586 END
9587         if ($v_version ge v3.0.0) {
9588             push @return, split /\n/, <<'END';
9589 jg ; n/a       ; ALAPH
9590 jg ; n/a       ; BEH
9591 jg ; n/a       ; BETH
9592 jg ; n/a       ; DALATH_RISH
9593 jg ; n/a       ; E
9594 jg ; n/a       ; FEH
9595 jg ; n/a       ; FINAL_SEMKATH
9596 jg ; n/a       ; GAMAL
9597 jg ; n/a       ; HAH
9598 jg ; n/a       ; HAMZA_ON_HEH_GOAL
9599 jg ; n/a       ; HE
9600 jg ; n/a       ; HEH
9601 jg ; n/a       ; HEH_GOAL
9602 jg ; n/a       ; HETH
9603 jg ; n/a       ; KAF
9604 jg ; n/a       ; KAPH
9605 jg ; n/a       ; KNOTTED_HEH
9606 jg ; n/a       ; LAMADH
9607 jg ; n/a       ; MIM
9608 jg ; n/a       ; NUN
9609 jg ; n/a       ; PE
9610 jg ; n/a       ; QAPH
9611 jg ; n/a       ; REH
9612 jg ; n/a       ; REVERSED_PE
9613 jg ; n/a       ; SADHE
9614 jg ; n/a       ; SEMKATH
9615 jg ; n/a       ; SHIN
9616 jg ; n/a       ; SWASH_KAF
9617 jg ; n/a       ; TAW
9618 jg ; n/a       ; TEH_MARBUTA
9619 jg ; n/a       ; TETH
9620 jg ; n/a       ; YEH
9621 jg ; n/a       ; YEH_BARREE
9622 jg ; n/a       ; YEH_WITH_TAIL
9623 jg ; n/a       ; YUDH
9624 jg ; n/a       ; YUDH_HE
9625 jg ; n/a       ; ZAIN
9626 END
9627         }
9628     }
9629
9630
9631     if (-e 'EastAsianWidth.txt') {
9632         push @return, split /\n/, <<'END';
9633 ea ; A         ; Ambiguous
9634 ea ; F         ; Fullwidth
9635 ea ; H         ; Halfwidth
9636 ea ; N         ; Neutral
9637 ea ; Na        ; Narrow
9638 ea ; W         ; Wide
9639 END
9640     }
9641
9642     if (-e 'LineBreak.txt') {
9643         push @return, split /\n/, <<'END';
9644 lb ; AI        ; Ambiguous
9645 lb ; AL        ; Alphabetic
9646 lb ; B2        ; Break_Both
9647 lb ; BA        ; Break_After
9648 lb ; BB        ; Break_Before
9649 lb ; BK        ; Mandatory_Break
9650 lb ; CB        ; Contingent_Break
9651 lb ; CL        ; Close_Punctuation
9652 lb ; CM        ; Combining_Mark
9653 lb ; CR        ; Carriage_Return
9654 lb ; EX        ; Exclamation
9655 lb ; GL        ; Glue
9656 lb ; HY        ; Hyphen
9657 lb ; ID        ; Ideographic
9658 lb ; IN        ; Inseperable
9659 lb ; IS        ; Infix_Numeric
9660 lb ; LF        ; Line_Feed
9661 lb ; NS        ; Nonstarter
9662 lb ; NU        ; Numeric
9663 lb ; OP        ; Open_Punctuation
9664 lb ; PO        ; Postfix_Numeric
9665 lb ; PR        ; Prefix_Numeric
9666 lb ; QU        ; Quotation
9667 lb ; SA        ; Complex_Context
9668 lb ; SG        ; Surrogate
9669 lb ; SP        ; Space
9670 lb ; SY        ; Break_Symbols
9671 lb ; XX        ; Unknown
9672 lb ; ZW        ; ZWSpace
9673 END
9674     }
9675
9676     if (-e 'DNormalizationProps.txt') {
9677         push @return, split /\n/, <<'END';
9678 qc ; M         ; Maybe
9679 qc ; N         ; No
9680 qc ; Y         ; Yes
9681 END
9682     }
9683
9684     if (-e 'Scripts.txt') {
9685         push @return, split /\n/, <<'END';
9686 sc ; Arab      ; Arabic
9687 sc ; Armn      ; Armenian
9688 sc ; Beng      ; Bengali
9689 sc ; Bopo      ; Bopomofo
9690 sc ; Cans      ; Canadian_Aboriginal
9691 sc ; Cher      ; Cherokee
9692 sc ; Cyrl      ; Cyrillic
9693 sc ; Deva      ; Devanagari
9694 sc ; Dsrt      ; Deseret
9695 sc ; Ethi      ; Ethiopic
9696 sc ; Geor      ; Georgian
9697 sc ; Goth      ; Gothic
9698 sc ; Grek      ; Greek
9699 sc ; Gujr      ; Gujarati
9700 sc ; Guru      ; Gurmukhi
9701 sc ; Hang      ; Hangul
9702 sc ; Hani      ; Han
9703 sc ; Hebr      ; Hebrew
9704 sc ; Hira      ; Hiragana
9705 sc ; Ital      ; Old_Italic
9706 sc ; Kana      ; Katakana
9707 sc ; Khmr      ; Khmer
9708 sc ; Knda      ; Kannada
9709 sc ; Laoo      ; Lao
9710 sc ; Latn      ; Latin
9711 sc ; Mlym      ; Malayalam
9712 sc ; Mong      ; Mongolian
9713 sc ; Mymr      ; Myanmar
9714 sc ; Ogam      ; Ogham
9715 sc ; Orya      ; Oriya
9716 sc ; Qaai      ; Inherited
9717 sc ; Runr      ; Runic
9718 sc ; Sinh      ; Sinhala
9719 sc ; Syrc      ; Syriac
9720 sc ; Taml      ; Tamil
9721 sc ; Telu      ; Telugu
9722 sc ; Thaa      ; Thaana
9723 sc ; Thai      ; Thai
9724 sc ; Tibt      ; Tibetan
9725 sc ; Yiii      ; Yi
9726 sc ; Zyyy      ; Common
9727 END
9728     }
9729
9730     if ($v_version ge v2.0.0) {
9731         push @return, split /\n/, <<'END';
9732 dt ; com       ; compat
9733 dt ; nar       ; narrow
9734 dt ; sml       ; small
9735 dt ; vert      ; vertical
9736 dt ; wide      ; wide
9737
9738 gc ; Cf        ; Format
9739 gc ; Cs        ; Surrogate
9740 gc ; Lt        ; Titlecase_Letter
9741 gc ; Me        ; Enclosing_Mark
9742 gc ; Nl        ; Letter_Number
9743 gc ; Pc        ; Connector_Punctuation
9744 gc ; Sk        ; Modifier_Symbol
9745 END
9746     }
9747     if ($v_version ge v2.1.2) {
9748         push @return, "bc ; S         ; Segment_Separator\n";
9749     }
9750     if ($v_version ge v2.1.5) {
9751         push @return, split /\n/, <<'END';
9752 gc ; Pf        ; Final_Punctuation
9753 gc ; Pi        ; Initial_Punctuation
9754 END
9755     }
9756     if ($v_version ge v2.1.8) {
9757         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
9758     }
9759
9760     if ($v_version ge v3.0.0) {
9761         push @return, split /\n/, <<'END';
9762 bc ; AL        ; Arabic_Letter
9763 bc ; BN        ; Boundary_Neutral
9764 bc ; LRE       ; Left_To_Right_Embedding
9765 bc ; LRO       ; Left_To_Right_Override
9766 bc ; NSM       ; Nonspacing_Mark
9767 bc ; PDF       ; Pop_Directional_Format
9768 bc ; RLE       ; Right_To_Left_Embedding
9769 bc ; RLO       ; Right_To_Left_Override
9770
9771 ccc; 233; DB   ; Double_Below
9772 END
9773     }
9774
9775     if ($v_version ge v3.1.0) {
9776         push @return, "ccc; 226; R    ; Right\n";
9777     }
9778
9779     return @return;
9780 }
9781
9782 sub process_NormalizationsTest {
9783
9784     # Each line looks like:
9785     #      source code point; NFC; NFD; NFKC; NFKD
9786     # e.g.
9787     #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
9788
9789     my $file= shift;
9790     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9791
9792     # Process each line of the file ...
9793     while ($file->next_line) {
9794
9795         next if /^@/;
9796
9797         my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
9798
9799         foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
9800             $$var = pack "U0U*", map { hex } split " ", $$var;
9801             $$var =~ s/(\\)/$1$1/g;
9802         }
9803
9804         push @normalization_tests,
9805                 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
9806     } # End of looping through the file
9807 }
9808
9809 sub output_perl_charnames_line ($$) {
9810
9811     # Output the entries in Perl_charnames specially, using 5 digits instead
9812     # of four.  This makes the entries a constant length, and simplifies
9813     # charnames.pm which this table is for.  Unicode can have 6 digit
9814     # ordinals, but they are all private use or noncharacters which do not
9815     # have names, so won't be in this table.
9816
9817     return sprintf "%05X\t%s\n", $_[0], $_[1];
9818 }
9819
9820 { # Closure
9821     # This is used to store the range list of all the code points usable when
9822     # the little used $compare_versions feature is enabled.
9823     my $compare_versions_range_list;
9824
9825     # These are constants to the $property_info hash in this subroutine, to
9826     # avoid using a quoted-string which might have a typo.
9827     my $TYPE  = 'type';
9828     my $DEFAULT_MAP = 'default_map';
9829     my $DEFAULT_TABLE = 'default_table';
9830     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9831     my $MISSINGS = 'missings';
9832
9833     sub process_generic_property_file {
9834         # This processes a file containing property mappings and puts them
9835         # into internal map tables.  It should be used to handle any property
9836         # files that have mappings from a code point or range thereof to
9837         # something else.  This means almost all the UCD .txt files.
9838         # each_line_handlers() should be set to adjust the lines of these
9839         # files, if necessary, to what this routine understands:
9840         #
9841         # 0374          ; NFD_QC; N
9842         # 003C..003E    ; Math
9843         #
9844         # the fields are: "codepoint-range ; property; map"
9845         #
9846         # meaning the codepoints in the range all have the value 'map' under
9847         # 'property'.
9848         # Beginning and trailing white space in each field are not significant.
9849         # Note there is not a trailing semi-colon in the above.  A trailing
9850         # semi-colon means the map is a null-string.  An omitted map, as
9851         # opposed to a null-string, is assumed to be 'Y', based on Unicode
9852         # table syntax.  (This could have been hidden from this routine by
9853         # doing it in the $file object, but that would require parsing of the
9854         # line there, so would have to parse it twice, or change the interface
9855         # to pass this an array.  So not done.)
9856         #
9857         # The map field may begin with a sequence of commands that apply to
9858         # this range.  Each such command begins and ends with $CMD_DELIM.
9859         # These are used to indicate, for example, that the mapping for a
9860         # range has a non-default type.
9861         #
9862         # This loops through the file, calling it's next_line() method, and
9863         # then taking the map and adding it to the property's table.
9864         # Complications arise because any number of properties can be in the
9865         # file, in any order, interspersed in any way.  The first time a
9866         # property is seen, it gets information about that property and
9867         # caches it for quick retrieval later.  It also normalizes the maps
9868         # so that only one of many synonyms is stored.  The Unicode input
9869         # files do use some multiple synonyms.
9870
9871         my $file = shift;
9872         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9873
9874         my %property_info;               # To keep track of what properties
9875                                          # have already had entries in the
9876                                          # current file, and info about each,
9877                                          # so don't have to recompute.
9878         my $property_name;               # property currently being worked on
9879         my $property_type;               # and its type
9880         my $previous_property_name = ""; # name from last time through loop
9881         my $property_object;             # pointer to the current property's
9882                                          # object
9883         my $property_addr;               # the address of that object
9884         my $default_map;                 # the string that code points missing
9885                                          # from the file map to
9886         my $default_table;               # For non-string properties, a
9887                                          # reference to the match table that
9888                                          # will contain the list of code
9889                                          # points that map to $default_map.
9890
9891         # Get the next real non-comment line
9892         LINE:
9893         while ($file->next_line) {
9894
9895             # Default replacement type; means that if parts of the range have
9896             # already been stored in our tables, the new map overrides them if
9897             # they differ more than cosmetically
9898             my $replace = $IF_NOT_EQUIVALENT;
9899             my $map_type;            # Default type for the map of this range
9900
9901             #local $to_trace = 1 if main::DEBUG;
9902             trace $_ if main::DEBUG && $to_trace;
9903
9904             # Split the line into components
9905             my ($range, $property_name, $map, @remainder)
9906                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9907
9908             # If more or less on the line than we are expecting, warn and skip
9909             # the line
9910             if (@remainder) {
9911                 $file->carp_bad_line('Extra fields');
9912                 next LINE;
9913             }
9914             elsif ( ! defined $property_name) {
9915                 $file->carp_bad_line('Missing property');
9916                 next LINE;
9917             }
9918
9919             # Examine the range.
9920             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9921             {
9922                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9923                 next LINE;
9924             }
9925             my $low = hex $1;
9926             my $high = (defined $2) ? hex $2 : $low;
9927
9928             # For the very specialized case of comparing two Unicode
9929             # versions...
9930             if (DEBUG && $compare_versions) {
9931                 if ($property_name eq 'Age') {
9932
9933                     # Only allow code points at least as old as the version
9934                     # specified.
9935                     my $age = pack "C*", split(/\./, $map);        # v string
9936                     next LINE if $age gt $compare_versions;
9937                 }
9938                 else {
9939
9940                     # Again, we throw out code points younger than those of
9941                     # the specified version.  By now, the Age property is
9942                     # populated.  We use the intersection of each input range
9943                     # with this property to find what code points in it are
9944                     # valid.   To do the intersection, we have to convert the
9945                     # Age property map to a Range_list.  We only have to do
9946                     # this once.
9947                     if (! defined $compare_versions_range_list) {
9948                         my $age = property_ref('Age');
9949                         if (! -e 'DAge.txt') {
9950                             croak "Need to have 'DAge.txt' file to do version comparison";
9951                         }
9952                         elsif ($age->count == 0) {
9953                             croak "The 'Age' table is empty, but its file exists";
9954                         }
9955                         $compare_versions_range_list
9956                                         = Range_List->new(Initialize => $age);
9957                     }
9958
9959                     # An undefined map is always 'Y'
9960                     $map = 'Y' if ! defined $map;
9961
9962                     # Calculate the intersection of the input range with the
9963                     # code points that are known in the specified version
9964                     my @ranges = ($compare_versions_range_list
9965                                   & Range->new($low, $high))->ranges;
9966
9967                     # If the intersection is empty, throw away this range
9968                     next LINE unless @ranges;
9969
9970                     # Only examine the first range this time through the loop.
9971                     my $this_range = shift @ranges;
9972
9973                     # Put any remaining ranges in the queue to be processed
9974                     # later.  Note that there is unnecessary work here, as we
9975                     # will do the intersection again for each of these ranges
9976                     # during some future iteration of the LINE loop, but this
9977                     # code is not used in production.  The later intersections
9978                     # are guaranteed to not splinter, so this will not become
9979                     # an infinite loop.
9980                     my $line = join ';', $property_name, $map;
9981                     foreach my $range (@ranges) {
9982                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9983                                                             $range->start,
9984                                                             $range->end,
9985                                                             $line));
9986                     }
9987
9988                     # And process the first range, like any other.
9989                     $low = $this_range->start;
9990                     $high = $this_range->end;
9991                 }
9992             } # End of $compare_versions
9993
9994             # If changing to a new property, get the things constant per
9995             # property
9996             if ($previous_property_name ne $property_name) {
9997
9998                 $property_object = property_ref($property_name);
9999                 if (! defined $property_object) {
10000                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
10001                     next LINE;
10002                 }
10003                 { no overloading; $property_addr = pack 'J', $property_object; }
10004
10005                 # Defer changing names until have a line that is acceptable
10006                 # (the 'next' statement above means is unacceptable)
10007                 $previous_property_name = $property_name;
10008
10009                 # If not the first time for this property, retrieve info about
10010                 # it from the cache
10011                 if (defined ($property_info{$property_addr}{$TYPE})) {
10012                     $property_type = $property_info{$property_addr}{$TYPE};
10013                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10014                     $map_type
10015                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10016                     $default_table
10017                             = $property_info{$property_addr}{$DEFAULT_TABLE};
10018                 }
10019                 else {
10020
10021                     # Here, is the first time for this property.  Set up the
10022                     # cache.
10023                     $property_type = $property_info{$property_addr}{$TYPE}
10024                                    = $property_object->type;
10025                     $map_type
10026                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10027                         = $property_object->pseudo_map_type;
10028
10029                     # The Unicode files are set up so that if the map is not
10030                     # defined, it is a binary property
10031                     if (! defined $map && $property_type != $BINARY) {
10032                         if ($property_type != $UNKNOWN
10033                             && $property_type != $NON_STRING)
10034                         {
10035                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10036                         }
10037                         else {
10038                             $property_object->set_type($BINARY);
10039                             $property_type
10040                                 = $property_info{$property_addr}{$TYPE}
10041                                 = $BINARY;
10042                         }
10043                     }
10044
10045                     # Get any @missings default for this property.  This
10046                     # should precede the first entry for the property in the
10047                     # input file, and is located in a comment that has been
10048                     # stored by the Input_file class until we access it here.
10049                     # It's possible that there is more than one such line
10050                     # waiting for us; collect them all, and parse
10051                     my @missings_list = $file->get_missings
10052                                             if $file->has_missings_defaults;
10053                     foreach my $default_ref (@missings_list) {
10054                         my $default = $default_ref->[0];
10055                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10056
10057                         # For string properties, the default is just what the
10058                         # file says, but non-string properties should already
10059                         # have set up a table for the default property value;
10060                         # use the table for these, so can resolve synonyms
10061                         # later to a single standard one.
10062                         if ($property_type == $STRING
10063                             || $property_type == $UNKNOWN)
10064                         {
10065                             $property_info{$addr}{$MISSINGS} = $default;
10066                         }
10067                         else {
10068                             $property_info{$addr}{$MISSINGS}
10069                                         = $property_object->table($default);
10070                         }
10071                     }
10072
10073                     # Finished storing all the @missings defaults in the input
10074                     # file so far.  Get the one for the current property.
10075                     my $missings = $property_info{$property_addr}{$MISSINGS};
10076
10077                     # But we likely have separately stored what the default
10078                     # should be.  (This is to accommodate versions of the
10079                     # standard where the @missings lines are absent or
10080                     # incomplete.)  Hopefully the two will match.  But check
10081                     # it out.
10082                     $default_map = $property_object->default_map;
10083
10084                     # If the map is a ref, it means that the default won't be
10085                     # processed until later, so undef it, so next few lines
10086                     # will redefine it to something that nothing will match
10087                     undef $default_map if ref $default_map;
10088
10089                     # Create a $default_map if don't have one; maybe a dummy
10090                     # that won't match anything.
10091                     if (! defined $default_map) {
10092
10093                         # Use any @missings line in the file.
10094                         if (defined $missings) {
10095                             if (ref $missings) {
10096                                 $default_map = $missings->full_name;
10097                                 $default_table = $missings;
10098                             }
10099                             else {
10100                                 $default_map = $missings;
10101                             }
10102
10103                             # And store it with the property for outside use.
10104                             $property_object->set_default_map($default_map);
10105                         }
10106                         else {
10107
10108                             # Neither an @missings nor a default map.  Create
10109                             # a dummy one, so won't have to test definedness
10110                             # in the main loop.
10111                             $default_map = '_Perl This will never be in a file
10112                                             from Unicode';
10113                         }
10114                     }
10115
10116                     # Here, we have $default_map defined, possibly in terms of
10117                     # $missings, but maybe not, and possibly is a dummy one.
10118                     if (defined $missings) {
10119
10120                         # Make sure there is no conflict between the two.
10121                         # $missings has priority.
10122                         if (ref $missings) {
10123                             $default_table
10124                                         = $property_object->table($default_map);
10125                             if (! defined $default_table
10126                                 || $default_table != $missings)
10127                             {
10128                                 if (! defined $default_table) {
10129                                     $default_table = $UNDEF;
10130                                 }
10131                                 $file->carp_bad_line(<<END
10132 The \@missings line for $property_name in $file says that missings default to
10133 $missings, but we expect it to be $default_table.  $missings used.
10134 END
10135                                 );
10136                                 $default_table = $missings;
10137                                 $default_map = $missings->full_name;
10138                             }
10139                             $property_info{$property_addr}{$DEFAULT_TABLE}
10140                                                         = $default_table;
10141                         }
10142                         elsif ($default_map ne $missings) {
10143                             $file->carp_bad_line(<<END
10144 The \@missings line for $property_name in $file says that missings default to
10145 $missings, but we expect it to be $default_map.  $missings used.
10146 END
10147                             );
10148                             $default_map = $missings;
10149                         }
10150                     }
10151
10152                     $property_info{$property_addr}{$DEFAULT_MAP}
10153                                                     = $default_map;
10154
10155                     # If haven't done so already, find the table corresponding
10156                     # to this map for non-string properties.
10157                     if (! defined $default_table
10158                         && $property_type != $STRING
10159                         && $property_type != $UNKNOWN)
10160                     {
10161                         $default_table = $property_info{$property_addr}
10162                                                         {$DEFAULT_TABLE}
10163                                     = $property_object->table($default_map);
10164                     }
10165                 } # End of is first time for this property
10166             } # End of switching properties.
10167
10168             # Ready to process the line.
10169             # The Unicode files are set up so that if the map is not defined,
10170             # it is a binary property with value 'Y'
10171             if (! defined $map) {
10172                 $map = 'Y';
10173             }
10174             else {
10175
10176                 # If the map begins with a special command to us (enclosed in
10177                 # delimiters), extract the command(s).
10178                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10179                     my $command = $1;
10180                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
10181                         $replace = $1;
10182                     }
10183                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
10184                         $map_type = $1;
10185                     }
10186                     else {
10187                         $file->carp_bad_line("Unknown command line: '$1'");
10188                         next LINE;
10189                     }
10190                 }
10191             }
10192
10193             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10194             {
10195
10196                 # Here, we have a map to a particular code point, and the
10197                 # default map is to a code point itself.  If the range
10198                 # includes the particular code point, change that portion of
10199                 # the range to the default.  This makes sure that in the final
10200                 # table only the non-defaults are listed.
10201                 my $decimal_map = hex $map;
10202                 if ($low <= $decimal_map && $decimal_map <= $high) {
10203
10204                     # If the range includes stuff before or after the map
10205                     # we're changing, split it and process the split-off parts
10206                     # later.
10207                     if ($low < $decimal_map) {
10208                         $file->insert_adjusted_lines(
10209                                             sprintf("%04X..%04X; %s; %s",
10210                                                     $low,
10211                                                     $decimal_map - 1,
10212                                                     $property_name,
10213                                                     $map));
10214                     }
10215                     if ($high > $decimal_map) {
10216                         $file->insert_adjusted_lines(
10217                                             sprintf("%04X..%04X; %s; %s",
10218                                                     $decimal_map + 1,
10219                                                     $high,
10220                                                     $property_name,
10221                                                     $map));
10222                     }
10223                     $low = $high = $decimal_map;
10224                     $map = $CODE_POINT;
10225                 }
10226             }
10227
10228             # If we can tell that this is a synonym for the default map, use
10229             # the default one instead.
10230             if ($property_type != $STRING
10231                 && $property_type != $UNKNOWN)
10232             {
10233                 my $table = $property_object->table($map);
10234                 if (defined $table && $table == $default_table) {
10235                     $map = $default_map;
10236                 }
10237             }
10238
10239             # And figure out the map type if not known.
10240             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10241                 if ($map eq "") {   # Nulls are always $NULL map type
10242                     $map_type = $NULL;
10243                 } # Otherwise, non-strings, and those that don't allow
10244                   # $MULTI_CP, and those that aren't multiple code points are
10245                   # 0
10246                 elsif
10247                    (($property_type != $STRING && $property_type != $UNKNOWN)
10248                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10249                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
10250                 {
10251                     $map_type = 0;
10252                 }
10253                 else {
10254                     $map_type = $MULTI_CP;
10255                 }
10256             }
10257
10258             $property_object->add_map($low, $high,
10259                                         $map,
10260                                         Type => $map_type,
10261                                         Replace => $replace);
10262         } # End of loop through file's lines
10263
10264         return;
10265     }
10266 }
10267
10268 { # Closure for UnicodeData.txt handling
10269
10270     # This file was the first one in the UCD; its design leads to some
10271     # awkwardness in processing.  Here is a sample line:
10272     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10273     # The fields in order are:
10274     my $i = 0;            # The code point is in field 0, and is shifted off.
10275     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
10276     my $CATEGORY = $i++;  # category (e.g. "Lu")
10277     my $CCC = $i++;       # Canonical combining class (e.g. "230")
10278     my $BIDI = $i++;      # directional class (e.g. "L")
10279     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
10280     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
10281     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10282                                          # Dual-use in this program; see below
10283     my $NUMERIC = $i++;   # numeric value
10284     my $MIRRORED = $i++;  # ? mirrored
10285     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10286     my $COMMENT = $i++;   # iso comment
10287     my $UPPER = $i++;     # simple uppercase mapping
10288     my $LOWER = $i++;     # simple lowercase mapping
10289     my $TITLE = $i++;     # simple titlecase mapping
10290     my $input_field_count = $i;
10291
10292     # This routine in addition outputs these extra fields:
10293
10294     my $DECOMP_TYPE = $i++; # Decomposition type
10295
10296     # These fields are modifications of ones above, and are usually
10297     # suppressed; they must come last, as for speed, the loop upper bound is
10298     # normally set to ignore them
10299     my $NAME = $i++;        # This is the strict name field, not the one that
10300                             # charnames uses.
10301     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
10302                             # by Unicode::Normalize
10303     my $last_field = $i - 1;
10304
10305     # All these are read into an array for each line, with the indices defined
10306     # above.  The empty fields in the example line above indicate that the
10307     # value is defaulted.  The handler called for each line of the input
10308     # changes these to their defaults.
10309
10310     # Here are the official names of the properties, in a parallel array:
10311     my @field_names;
10312     $field_names[$BIDI] = 'Bidi_Class';
10313     $field_names[$CATEGORY] = 'General_Category';
10314     $field_names[$CCC] = 'Canonical_Combining_Class';
10315     $field_names[$CHARNAME] = 'Perl_Charnames';
10316     $field_names[$COMMENT] = 'ISO_Comment';
10317     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10318     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10319     $field_names[$LOWER] = 'Lowercase_Mapping';
10320     $field_names[$MIRRORED] = 'Bidi_Mirrored';
10321     $field_names[$NAME] = 'Name';
10322     $field_names[$NUMERIC] = 'Numeric_Value';
10323     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10324     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10325     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10326     $field_names[$TITLE] = 'Titlecase_Mapping';
10327     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10328     $field_names[$UPPER] = 'Uppercase_Mapping';
10329
10330     # Some of these need a little more explanation:
10331     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10332     #   property, but is used in calculating the Numeric_Type.  Perl however,
10333     #   creates a file from this field, so a Perl property is created from it.
10334     # Similarly, the Other_Digit field is used only for calculating the
10335     #   Numeric_Type, and so it can be safely re-used as the place to store
10336     #   the value for Numeric_Type; hence it is referred to as
10337     #   $NUMERIC_TYPE_OTHER_DIGIT.
10338     # The input field named $PERL_DECOMPOSITION is a combination of both the
10339     #   decomposition mapping and its type.  Perl creates a file containing
10340     #   exactly this field, so it is used for that.  The two properties are
10341     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10342     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
10343     #   output it), as Perl doesn't use it directly.
10344     # The input field named here $CHARNAME is used to construct the
10345     #   Perl_Charnames property, which is a combination of the Name property
10346     #   (which the input field contains), and the Unicode_1_Name property, and
10347     #   others from other files.  Since, the strict Name property is not used
10348     #   by Perl, this field is used for the table that Perl does use.  The
10349     #   strict Name property table is usually suppressed (unless the lists are
10350     #   changed to output it), so it is accumulated in a separate field,
10351     #   $NAME, which to save time is discarded unless the table is actually to
10352     #   be output
10353
10354     # This file is processed like most in this program.  Control is passed to
10355     # process_generic_property_file() which calls filter_UnicodeData_line()
10356     # for each input line.  This filter converts the input into line(s) that
10357     # process_generic_property_file() understands.  There is also a setup
10358     # routine called before any of the file is processed, and a handler for
10359     # EOF processing, all in this closure.
10360
10361     # A huge speed-up occurred at the cost of some added complexity when these
10362     # routines were altered to buffer the outputs into ranges.  Almost all the
10363     # lines of the input file apply to just one code point, and for most
10364     # properties, the map for the next code point up is the same as the
10365     # current one.  So instead of creating a line for each property for each
10366     # input line, filter_UnicodeData_line() remembers what the previous map
10367     # of a property was, and doesn't generate a line to pass on until it has
10368     # to, as when the map changes; and that passed-on line encompasses the
10369     # whole contiguous range of code points that have the same map for that
10370     # property.  This means a slight amount of extra setup, and having to
10371     # flush these buffers on EOF, testing if the maps have changed, plus
10372     # remembering state information in the closure.  But it means a lot less
10373     # real time in not having to change the data base for each property on
10374     # each line.
10375
10376     # Another complication is that there are already a few ranges designated
10377     # in the input.  There are two lines for each, with the same maps except
10378     # the code point and name on each line.  This was actually the hardest
10379     # thing to design around.  The code points in those ranges may actually
10380     # have real maps not given by these two lines.  These maps will either
10381     # be algorithmically determinable, or be in the extracted files furnished
10382     # with the UCD.  In the event of conflicts between these extracted files,
10383     # and this one, Unicode says that this one prevails.  But it shouldn't
10384     # prevail for conflicts that occur in these ranges.  The data from the
10385     # extracted files prevails in those cases.  So, this program is structured
10386     # so that those files are processed first, storing maps.  Then the other
10387     # files are processed, generally overwriting what the extracted files
10388     # stored.  But just the range lines in this input file are processed
10389     # without overwriting.  This is accomplished by adding a special string to
10390     # the lines output to tell process_generic_property_file() to turn off the
10391     # overwriting for just this one line.
10392     # A similar mechanism is used to tell it that the map is of a non-default
10393     # type.
10394
10395     sub setup_UnicodeData { # Called before any lines of the input are read
10396         my $file = shift;
10397         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10398
10399         # Create a new property specially located that is a combination of the
10400         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10401         # Name_Alias properties.  (The final duplicates elements of the
10402         # first.)  A comment for it will later be constructed based on the
10403         # actual properties present and used
10404         $perl_charname = Property->new('Perl_Charnames',
10405                        Default_Map => "",
10406                        Directory => File::Spec->curdir(),
10407                        File => 'Name',
10408                        Fate => $INTERNAL_ONLY,
10409                        Perl_Extension => 1,
10410                        Range_Size_1 => \&output_perl_charnames_line,
10411                        Type => $STRING,
10412                        );
10413         $perl_charname->set_proxy_for('Name');
10414
10415         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10416                                         Directory => File::Spec->curdir(),
10417                                         File => 'Decomposition',
10418                                         Format => $DECOMP_STRING_FORMAT,
10419                                         Fate => $INTERNAL_ONLY,
10420                                         Perl_Extension => 1,
10421                                         Default_Map => $CODE_POINT,
10422
10423                                         # normalize.pm can't cope with these
10424                                         Output_Range_Counts => 0,
10425
10426                                         # This is a specially formatted table
10427                                         # explicitly for normalize.pm, which
10428                                         # is expecting a particular format,
10429                                         # which means that mappings containing
10430                                         # multiple code points are in the main
10431                                         # body of the table
10432                                         Map_Type => $COMPUTE_NO_MULTI_CP,
10433                                         Type => $STRING,
10434                                         To_Output_Map => $INTERNAL_MAP,
10435                                         );
10436         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10437         $Perl_decomp->add_comment(join_lines(<<END
10438 This mapping is a combination of the Unicode 'Decomposition_Type' and
10439 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
10440 identical to the official Unicode 'Decomposition_Mapping' property except for
10441 two things:
10442  1) It omits the algorithmically determinable Hangul syllable decompositions,
10443 which normalize.pm handles algorithmically.
10444  2) It contains the decomposition type as well.  Non-canonical decompositions
10445 begin with a word in angle brackets, like <super>, which denotes the
10446 compatible decomposition type.  If the map does not begin with the <angle
10447 brackets>, the decomposition is canonical.
10448 END
10449         ));
10450
10451         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10452                                         Default_Map => "",
10453                                         Perl_Extension => 1,
10454                                         Directory => $map_directory,
10455                                         Type => $STRING,
10456                                         To_Output_Map => $OUTPUT_ADJUSTED,
10457                                         );
10458         $Decimal_Digit->add_comment(join_lines(<<END
10459 This file gives the mapping of all code points which represent a single
10460 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10461 points, and the mapping of each non-initial element of each range is actually
10462 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10463 These code points are those that have Numeric_Type=Decimal; not special
10464 things, like subscripts nor Roman numerals.
10465 END
10466         ));
10467
10468         # These properties are not used for generating anything else, and are
10469         # usually not output.  By making them last in the list, we can just
10470         # change the high end of the loop downwards to avoid the work of
10471         # generating a table(s) that is/are just going to get thrown away.
10472         if (! property_ref('Decomposition_Mapping')->to_output_map
10473             && ! property_ref('Name')->to_output_map)
10474         {
10475             $last_field = min($NAME, $DECOMP_MAP) - 1;
10476         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10477             $last_field = $DECOMP_MAP;
10478         } elsif (property_ref('Name')->to_output_map) {
10479             $last_field = $NAME;
10480         }
10481         return;
10482     }
10483
10484     my $first_time = 1;                 # ? Is this the first line of the file
10485     my $in_range = 0;                   # ? Are we in one of the file's ranges
10486     my $previous_cp;                    # hex code point of previous line
10487     my $decimal_previous_cp = -1;       # And its decimal equivalent
10488     my @start;                          # For each field, the current starting
10489                                         # code point in hex for the range
10490                                         # being accumulated.
10491     my @fields;                         # The input fields;
10492     my @previous_fields;                # And those from the previous call
10493
10494     sub filter_UnicodeData_line {
10495         # Handle a single input line from UnicodeData.txt; see comments above
10496         # Conceptually this takes a single line from the file containing N
10497         # properties, and converts it into N lines with one property per line,
10498         # which is what the final handler expects.  But there are
10499         # complications due to the quirkiness of the input file, and to save
10500         # time, it accumulates ranges where the property values don't change
10501         # and only emits lines when necessary.  This is about an order of
10502         # magnitude fewer lines emitted.
10503
10504         my $file = shift;
10505         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10506
10507         # $_ contains the input line.
10508         # -1 in split means retain trailing null fields
10509         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10510
10511         #local $to_trace = 1 if main::DEBUG;
10512         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10513         if (@fields > $input_field_count) {
10514             $file->carp_bad_line('Extra fields');
10515             $_ = "";
10516             return;
10517         }
10518
10519         my $decimal_cp = hex $cp;
10520
10521         # We have to output all the buffered ranges when the next code point
10522         # is not exactly one after the previous one, which means there is a
10523         # gap in the ranges.
10524         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10525
10526         # The decomposition mapping field requires special handling.  It looks
10527         # like either:
10528         #
10529         # <compat> 0032 0020
10530         # 0041 0300
10531         #
10532         # The decomposition type is enclosed in <brackets>; if missing, it
10533         # means the type is canonical.  There are two decomposition mapping
10534         # tables: the one for use by Perl's normalize.pm has a special format
10535         # which is this field intact; the other, for general use is of
10536         # standard format.  In either case we have to find the decomposition
10537         # type.  Empty fields have None as their type, and map to the code
10538         # point itself
10539         if ($fields[$PERL_DECOMPOSITION] eq "") {
10540             $fields[$DECOMP_TYPE] = 'None';
10541             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10542         }
10543         else {
10544             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10545                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
10546             if (! defined $fields[$DECOMP_TYPE]) {
10547                 $fields[$DECOMP_TYPE] = 'Canonical';
10548                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10549             }
10550             else {
10551                 $fields[$DECOMP_MAP] = $map;
10552             }
10553         }
10554
10555         # The 3 numeric fields also require special handling.  The 2 digit
10556         # fields must be either empty or match the number field.  This means
10557         # that if it is empty, they must be as well, and the numeric type is
10558         # None, and the numeric value is 'Nan'.
10559         # The decimal digit field must be empty or match the other digit
10560         # field.  If the decimal digit field is non-empty, the code point is
10561         # a decimal digit, and the other two fields will have the same value.
10562         # If it is empty, but the other digit field is non-empty, the code
10563         # point is an 'other digit', and the number field will have the same
10564         # value as the other digit field.  If the other digit field is empty,
10565         # but the number field is non-empty, the code point is a generic
10566         # numeric type.
10567         if ($fields[$NUMERIC] eq "") {
10568             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10569                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10570             ) {
10571                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
10572             }
10573             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10574             $fields[$NUMERIC] = 'NaN';
10575         }
10576         else {
10577             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10578             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10579                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10580                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
10581                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10582             }
10583             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10584                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10585                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10586             }
10587             else {
10588                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10589
10590                 # Rationals require extra effort.
10591                 register_fraction($fields[$NUMERIC])
10592                                                 if $fields[$NUMERIC] =~ qr{/};
10593             }
10594         }
10595
10596         # For the properties that have empty fields in the file, and which
10597         # mean something different from empty, change them to that default.
10598         # Certain fields just haven't been empty so far in any Unicode
10599         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10600         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
10601         # the defaults; which are very unlikely to ever change.
10602         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10603         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10604
10605         # UAX44 says that if title is empty, it is the same as whatever upper
10606         # is,
10607         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10608
10609         # There are a few pairs of lines like:
10610         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10611         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10612         # that define ranges.  These should be processed after the fields are
10613         # adjusted above, as they may override some of them; but mostly what
10614         # is left is to possibly adjust the $CHARNAME field.  The names of all the
10615         # paired lines start with a '<', but this is also true of '<control>,
10616         # which isn't one of these special ones.
10617         if ($fields[$CHARNAME] eq '<control>') {
10618
10619             # Some code points in this file have the pseudo-name
10620             # '<control>', but the official name for such ones is the null
10621             # string.
10622             $fields[$NAME] = $fields[$CHARNAME] = "";
10623
10624             # We had better not be in between range lines.
10625             if ($in_range) {
10626                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10627                 $in_range = 0;
10628             }
10629         }
10630         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10631
10632             # Here is a non-range line.  We had better not be in between range
10633             # lines.
10634             if ($in_range) {
10635                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10636                 $in_range = 0;
10637             }
10638             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10639
10640                 # These are code points whose names end in their code points,
10641                 # which means the names are algorithmically derivable from the
10642                 # code points.  To shorten the output Name file, the algorithm
10643                 # for deriving these is placed in the file instead of each
10644                 # code point, so they have map type $CP_IN_NAME
10645                 $fields[$CHARNAME] = $CMD_DELIM
10646                                  . $MAP_TYPE_CMD
10647                                  . '='
10648                                  . $CP_IN_NAME
10649                                  . $CMD_DELIM
10650                                  . $fields[$CHARNAME];
10651             }
10652             $fields[$NAME] = $fields[$CHARNAME];
10653         }
10654         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10655             $fields[$CHARNAME] = $fields[$NAME] = $1;
10656
10657             # Here we are at the beginning of a range pair.
10658             if ($in_range) {
10659                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
10660             }
10661             $in_range = 1;
10662
10663             # Because the properties in the range do not overwrite any already
10664             # in the db, we must flush the buffers of what's already there, so
10665             # they get handled in the normal scheme.
10666             $force_output = 1;
10667
10668         }
10669         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10670             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
10671             $_ = "";
10672             return;
10673         }
10674         else { # Here, we are at the last line of a range pair.
10675
10676             if (! $in_range) {
10677                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
10678                 $_ = "";
10679                 return;
10680             }
10681             $in_range = 0;
10682
10683             $fields[$NAME] = $fields[$CHARNAME];
10684
10685             # Check that the input is valid: that the closing of the range is
10686             # the same as the beginning.
10687             foreach my $i (0 .. $last_field) {
10688                 next if $fields[$i] eq $previous_fields[$i];
10689                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
10690             }
10691
10692             # The processing differs depending on the type of range,
10693             # determined by its $CHARNAME
10694             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10695
10696                 # Check that the data looks right.
10697                 if ($decimal_previous_cp != $SBase) {
10698                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
10699                 }
10700                 if ($decimal_cp != $SBase + $SCount - 1) {
10701                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
10702                 }
10703
10704                 # The Hangul syllable range has a somewhat complicated name
10705                 # generation algorithm.  Each code point in it has a canonical
10706                 # decomposition also computable by an algorithm.  The
10707                 # perl decomposition map table built from these is used only
10708                 # by normalize.pm, which has the algorithm built in it, so the
10709                 # decomposition maps are not needed, and are large, so are
10710                 # omitted from it.  If the full decomposition map table is to
10711                 # be output, the decompositions are generated for it, in the
10712                 # EOF handling code for this input file.
10713
10714                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10715
10716                 # This range is stored in our internal structure with its
10717                 # own map type, different from all others.
10718                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10719                                         = $CMD_DELIM
10720                                           . $MAP_TYPE_CMD
10721                                           . '='
10722                                           . $HANGUL_SYLLABLE
10723                                           . $CMD_DELIM
10724                                           . $fields[$CHARNAME];
10725             }
10726             elsif ($fields[$CHARNAME] =~ /^CJK/) {
10727
10728                 # The name for these contains the code point itself, and all
10729                 # are defined to have the same base name, regardless of what
10730                 # is in the file.  They are stored in our internal structure
10731                 # with a map type of $CP_IN_NAME
10732                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10733                                         = $CMD_DELIM
10734                                            . $MAP_TYPE_CMD
10735                                            . '='
10736                                            . $CP_IN_NAME
10737                                            . $CMD_DELIM
10738                                            . 'CJK UNIFIED IDEOGRAPH';
10739
10740             }
10741             elsif ($fields[$CATEGORY] eq 'Co'
10742                      || $fields[$CATEGORY] eq 'Cs')
10743             {
10744                 # The names of all the code points in these ranges are set to
10745                 # null, as there are no names for the private use and
10746                 # surrogate code points.
10747
10748                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10749             }
10750             else {
10751                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
10752             }
10753
10754             # The first line of the range caused everything else to be output,
10755             # and then its values were stored as the beginning values for the
10756             # next set of ranges, which this one ends.  Now, for each value,
10757             # add a command to tell the handler that these values should not
10758             # replace any existing ones in our database.
10759             foreach my $i (0 .. $last_field) {
10760                 $previous_fields[$i] = $CMD_DELIM
10761                                         . $REPLACE_CMD
10762                                         . '='
10763                                         . $NO
10764                                         . $CMD_DELIM
10765                                         . $previous_fields[$i];
10766             }
10767
10768             # And change things so it looks like the entire range has been
10769             # gone through with this being the final part of it.  Adding the
10770             # command above to each field will cause this range to be flushed
10771             # during the next iteration, as it guaranteed that the stored
10772             # field won't match whatever value the next one has.
10773             $previous_cp = $cp;
10774             $decimal_previous_cp = $decimal_cp;
10775
10776             # We are now set up for the next iteration; so skip the remaining
10777             # code in this subroutine that does the same thing, but doesn't
10778             # know about these ranges.
10779             $_ = "";
10780
10781             return;
10782         }
10783
10784         # On the very first line, we fake it so the code below thinks there is
10785         # nothing to output, and initialize so that when it does get output it
10786         # uses the first line's values for the lowest part of the range.
10787         # (One could avoid this by using peek(), but then one would need to
10788         # know the adjustments done above and do the same ones in the setup
10789         # routine; not worth it)
10790         if ($first_time) {
10791             $first_time = 0;
10792             @previous_fields = @fields;
10793             @start = ($cp) x scalar @fields;
10794             $decimal_previous_cp = $decimal_cp - 1;
10795         }
10796
10797         # For each field, output the stored up ranges that this code point
10798         # doesn't fit in.  Earlier we figured out if all ranges should be
10799         # terminated because of changing the replace or map type styles, or if
10800         # there is a gap between this new code point and the previous one, and
10801         # that is stored in $force_output.  But even if those aren't true, we
10802         # need to output the range if this new code point's value for the
10803         # given property doesn't match the stored range's.
10804         #local $to_trace = 1 if main::DEBUG;
10805         foreach my $i (0 .. $last_field) {
10806             my $field = $fields[$i];
10807             if ($force_output || $field ne $previous_fields[$i]) {
10808
10809                 # Flush the buffer of stored values.
10810                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10811
10812                 # Start a new range with this code point and its value
10813                 $start[$i] = $cp;
10814                 $previous_fields[$i] = $field;
10815             }
10816         }
10817
10818         # Set the values for the next time.
10819         $previous_cp = $cp;
10820         $decimal_previous_cp = $decimal_cp;
10821
10822         # The input line has generated whatever adjusted lines are needed, and
10823         # should not be looked at further.
10824         $_ = "";
10825         return;
10826     }
10827
10828     sub EOF_UnicodeData {
10829         # Called upon EOF to flush the buffers, and create the Hangul
10830         # decomposition mappings if needed.
10831
10832         my $file = shift;
10833         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10834
10835         # Flush the buffers.
10836         foreach my $i (0 .. $last_field) {
10837             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10838         }
10839
10840         if (-e 'Jamo.txt') {
10841
10842             # The algorithm is published by Unicode, based on values in
10843             # Jamo.txt, (which should have been processed before this
10844             # subroutine), and the results left in %Jamo
10845             unless (%Jamo) {
10846                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
10847                 return;
10848             }
10849
10850             # If the full decomposition map table is being output, insert
10851             # into it the Hangul syllable mappings.  This is to avoid having
10852             # to publish a subroutine in it to compute them.  (which would
10853             # essentially be this code.)  This uses the algorithm published by
10854             # Unicode.  (No hangul syllables in version 1)
10855             if ($v_version ge v2.0.0
10856                 && property_ref('Decomposition_Mapping')->to_output_map) {
10857                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10858                     use integer;
10859                     my $SIndex = $S - $SBase;
10860                     my $L = $LBase + $SIndex / $NCount;
10861                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
10862                     my $T = $TBase + $SIndex % $TCount;
10863
10864                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10865                     my $decomposition = sprintf("%04X %04X", $L, $V);
10866                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10867                     $file->insert_adjusted_lines(
10868                                 sprintf("%04X; Decomposition_Mapping; %s",
10869                                         $S,
10870                                         $decomposition));
10871                 }
10872             }
10873         }
10874
10875         return;
10876     }
10877
10878     sub filter_v1_ucd {
10879         # Fix UCD lines in version 1.  This is probably overkill, but this
10880         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
10881         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
10882         #       removed.  This program retains them
10883         # 2)    didn't include ranges, which it should have, and which are now
10884         #       added in @corrected_lines below.  It was hand populated by
10885         #       taking the data from Version 2, verified by analyzing
10886         #       DAge.txt.
10887         # 3)    There is a syntax error in the entry for U+09F8 which could
10888         #       cause problems for utf8_heavy, and so is changed.  It's
10889         #       numeric value was simply a minus sign, without any number.
10890         #       (Eventually Unicode changed the code point to non-numeric.)
10891         # 4)    The decomposition types often don't match later versions
10892         #       exactly, and the whole syntax of that field is different; so
10893         #       the syntax is changed as well as the types to their later
10894         #       terminology.  Otherwise normalize.pm would be very unhappy
10895         # 5)    Many ccc classes are different.  These are left intact.
10896         # 6)    U+FF10..U+FF19 are missing their numeric values in all three
10897         #       fields.  These are unchanged because it doesn't really cause
10898         #       problems for Perl.
10899         # 7)    A number of code points, such as controls, don't have their
10900         #       Unicode Version 1 Names in this file.  These are added.
10901         # 8)    A number of Symbols were marked as Lm.  This changes those in
10902         #       the Latin1 range, so that regexes work.
10903         # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
10904         #       referred to by their lc equivalents.  Not fixed.
10905
10906         my @corrected_lines = split /\n/, <<'END';
10907 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10908 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10909 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10910 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10911 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10912 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10913 END
10914
10915         my $file = shift;
10916         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10917
10918         #local $to_trace = 1 if main::DEBUG;
10919         trace $_ if main::DEBUG && $to_trace;
10920
10921         # -1 => retain trailing null fields
10922         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10923
10924         # At the first place that is wrong in the input, insert all the
10925         # corrections, replacing the wrong line.
10926         if ($code_point eq '4E00') {
10927             my @copy = @corrected_lines;
10928             $_ = shift @copy;
10929             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10930
10931             $file->insert_lines(@copy);
10932         }
10933         elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
10934
10935             # There are no Lm characters in Latin1; these should be 'Sk', but
10936             # there isn't that in V1.
10937             $fields[$CATEGORY] = 'So';
10938         }
10939
10940         if ($fields[$NUMERIC] eq '-') {
10941             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
10942         }
10943
10944         if  ($fields[$PERL_DECOMPOSITION] ne "") {
10945
10946             # Several entries have this change to superscript 2 or 3 in the
10947             # middle.  Convert these to the modern version, which is to use
10948             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10949             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10950             # 'HHHH HHHH 00B3 HHHH'.
10951             # It turns out that all of these that don't have another
10952             # decomposition defined at the beginning of the line have the
10953             # <square> decomposition in later releases.
10954             if ($code_point ne '00B2' && $code_point ne '00B3') {
10955                 if  ($fields[$PERL_DECOMPOSITION]
10956                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10957                 {
10958                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10959                         $fields[$PERL_DECOMPOSITION] = '<square> '
10960                         . $fields[$PERL_DECOMPOSITION];
10961                     }
10962                 }
10963             }
10964
10965             # If is like '<+circled> 0052 <-circled>', convert to
10966             # '<circled> 0052'
10967             $fields[$PERL_DECOMPOSITION] =~
10968                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
10969
10970             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10971             $fields[$PERL_DECOMPOSITION] =~
10972                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10973             or $fields[$PERL_DECOMPOSITION] =~
10974                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10975             or $fields[$PERL_DECOMPOSITION] =~
10976                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10977             or $fields[$PERL_DECOMPOSITION] =~
10978                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10979
10980             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10981             $fields[$PERL_DECOMPOSITION] =~
10982                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10983
10984             # Change names to modern form.
10985             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10986             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10987             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10988             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10989
10990             # One entry has weird braces
10991             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10992
10993             # One entry at U+2116 has an extra <sup>
10994             $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
10995         }
10996
10997         $_ = join ';', $code_point, @fields;
10998         trace $_ if main::DEBUG && $to_trace;
10999         return;
11000     }
11001
11002     sub filter_bad_Nd_ucd {
11003         # Early versions specified a value in the decimal digit field even
11004         # though the code point wasn't a decimal digit.  Clear the field in
11005         # that situation, so that the main code doesn't think it is a decimal
11006         # digit.
11007
11008         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11009         if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11010             $fields[$PERL_DECIMAL_DIGIT] = "";
11011             $_ = join ';', $code_point, @fields;
11012         }
11013         return;
11014     }
11015
11016     my @U1_control_names = split /\n/, <<'END';
11017 NULL
11018 START OF HEADING
11019 START OF TEXT
11020 END OF TEXT
11021 END OF TRANSMISSION
11022 ENQUIRY
11023 ACKNOWLEDGE
11024 BELL
11025 BACKSPACE
11026 HORIZONTAL TABULATION
11027 LINE FEED
11028 VERTICAL TABULATION
11029 FORM FEED
11030 CARRIAGE RETURN
11031 SHIFT OUT
11032 SHIFT IN
11033 DATA LINK ESCAPE
11034 DEVICE CONTROL ONE
11035 DEVICE CONTROL TWO
11036 DEVICE CONTROL THREE
11037 DEVICE CONTROL FOUR
11038 NEGATIVE ACKNOWLEDGE
11039 SYNCHRONOUS IDLE
11040 END OF TRANSMISSION BLOCK
11041 CANCEL
11042 END OF MEDIUM
11043 SUBSTITUTE
11044 ESCAPE
11045 FILE SEPARATOR
11046 GROUP SEPARATOR
11047 RECORD SEPARATOR
11048 UNIT SEPARATOR
11049 DELETE
11050 BREAK PERMITTED HERE
11051 NO BREAK HERE
11052 INDEX
11053 NEXT LINE
11054 START OF SELECTED AREA
11055 END OF SELECTED AREA
11056 CHARACTER TABULATION SET
11057 CHARACTER TABULATION WITH JUSTIFICATION
11058 LINE TABULATION SET
11059 PARTIAL LINE DOWN
11060 PARTIAL LINE UP
11061 REVERSE LINE FEED
11062 SINGLE SHIFT TWO
11063 SINGLE SHIFT THREE
11064 DEVICE CONTROL STRING
11065 PRIVATE USE ONE
11066 PRIVATE USE TWO
11067 SET TRANSMIT STATE
11068 CANCEL CHARACTER
11069 MESSAGE WAITING
11070 START OF GUARDED AREA
11071 END OF GUARDED AREA
11072 START OF STRING
11073 SINGLE CHARACTER INTRODUCER
11074 CONTROL SEQUENCE INTRODUCER
11075 STRING TERMINATOR
11076 OPERATING SYSTEM COMMAND
11077 PRIVACY MESSAGE
11078 APPLICATION PROGRAM COMMAND
11079 END
11080
11081     sub filter_early_U1_names {
11082         # Very early versions did not have the Unicode_1_name field specified.
11083         # They differed in which ones were present; make sure a U1 name
11084         # exists, so that Unicode::UCD::charinfo will work
11085
11086         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11087
11088
11089         # @U1_control names above are entirely positional, so we pull them out
11090         # in the exact order required, with gaps for the ones that don't have
11091         # names.
11092         if ($code_point =~ /^00[01]/
11093             || $code_point eq '007F'
11094             || $code_point =~ /^008[2-9A-F]/
11095             || $code_point =~ /^009[0-8A-F]/)
11096         {
11097             my $u1_name = shift @U1_control_names;
11098             $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11099             $_ = join ';', $code_point, @fields;
11100         }
11101         return;
11102     }
11103
11104     sub filter_v2_1_5_ucd {
11105         # A dozen entries in this 2.1.5 file had the mirrored and numeric
11106         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
11107         # column appears to be N, swap it back.
11108
11109         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11110         if ($fields[$NUMERIC] eq 'N') {
11111             $fields[$NUMERIC] = $fields[$MIRRORED];
11112             $fields[$MIRRORED] = 'N';
11113             $_ = join ';', $code_point, @fields;
11114         }
11115         return;
11116     }
11117
11118     sub filter_v6_ucd {
11119
11120         # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11121         # it wasn't accepted, to allow for some deprecation cycles.  This
11122         # function is not called after 5.16
11123
11124         return if $_ !~ /^(?:0007|1F514|070F);/;
11125
11126         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11127         if ($code_point eq '0007') {
11128             $fields[$CHARNAME] = "";
11129         }
11130         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11131                             # http://www.unicode.org/versions/corrigendum8.html
11132             $fields[$BIDI] = "AL";
11133         }
11134         elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11135             $fields[$CHARNAME] = "";
11136         }
11137
11138         $_ = join ';', $code_point, @fields;
11139
11140         return;
11141     }
11142 } # End closure for UnicodeData
11143
11144 sub process_GCB_test {
11145
11146     my $file = shift;
11147     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11148
11149     while ($file->next_line) {
11150         push @backslash_X_tests, $_;
11151     }
11152
11153     return;
11154 }
11155
11156 sub process_NamedSequences {
11157     # NamedSequences.txt entries are just added to an array.  Because these
11158     # don't look like the other tables, they have their own handler.
11159     # An example:
11160     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11161     #
11162     # This just adds the sequence to an array for later handling
11163
11164     my $file = shift;
11165     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11166
11167     while ($file->next_line) {
11168         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11169         if (@remainder) {
11170             $file->carp_bad_line(
11171                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11172             next;
11173         }
11174
11175         # Note single \t in keeping with special output format of
11176         # Perl_charnames.  But it turns out that the code points don't have to
11177         # be 5 digits long, like the rest, based on the internal workings of
11178         # charnames.pm.  This could be easily changed for consistency.
11179         push @named_sequences, "$sequence\t$name";
11180     }
11181     return;
11182 }
11183
11184 { # Closure
11185
11186     my $first_range;
11187
11188     sub  filter_early_ea_lb {
11189         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
11190         # third field be the name of the code point, which can be ignored in
11191         # most cases.  But it can be meaningful if it marks a range:
11192         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11193         # 3400;W;<CJK Ideograph Extension A, First>
11194         #
11195         # We need to see the First in the example above to know it's a range.
11196         # They did not use the later range syntaxes.  This routine changes it
11197         # to use the modern syntax.
11198         # $1 is the Input_file object.
11199
11200         my @fields = split /\s*;\s*/;
11201         if ($fields[2] =~ /^<.*, First>/) {
11202             $first_range = $fields[0];
11203             $_ = "";
11204         }
11205         elsif ($fields[2] =~ /^<.*, Last>/) {
11206             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11207         }
11208         else {
11209             undef $first_range;
11210             $_ = "$fields[0]; $fields[1]";
11211         }
11212
11213         return;
11214     }
11215 }
11216
11217 sub filter_old_style_arabic_shaping {
11218     # Early versions used a different term for the later one.
11219
11220     my @fields = split /\s*;\s*/;
11221     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11222     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
11223     $_ = join ';', @fields;
11224     return;
11225 }
11226
11227 sub filter_arabic_shaping_line {
11228     # ArabicShaping.txt has entries that look like:
11229     # 062A; TEH; D; BEH
11230     # The field containing 'TEH' is not used.  The next field is Joining_Type
11231     # and the last is Joining_Group
11232     # This generates two lines to pass on, one for each property on the input
11233     # line.
11234
11235     my $file = shift;
11236     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11237
11238     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11239
11240     if (@fields > 4) {
11241         $file->carp_bad_line('Extra fields');
11242         $_ = "";
11243         return;
11244     }
11245
11246     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
11247     $_ = "$fields[0]; Joining_Type; $fields[2]";
11248
11249     return;
11250 }
11251
11252 { # Closure
11253     my $lc; # Table for lowercase mapping
11254     my $tc;
11255     my $uc;
11256     my %special_casing_code_points;
11257
11258     sub setup_special_casing {
11259         # SpecialCasing.txt contains the non-simple case change mappings.  The
11260         # simple ones are in UnicodeData.txt, which should already have been
11261         # read in to the full property data structures, so as to initialize
11262         # these with the simple ones.  Then the SpecialCasing.txt entries
11263         # add or overwrite the ones which have different full mappings.
11264
11265         # This routine sees if the simple mappings are to be output, and if
11266         # so, copies what has already been put into the full mapping tables,
11267         # while they still contain only the simple mappings.
11268
11269         # The reason it is done this way is that the simple mappings are
11270         # probably not going to be output, so it saves work to initialize the
11271         # full tables with the simple mappings, and then overwrite those
11272         # relatively few entries in them that have different full mappings,
11273         # and thus skip the simple mapping tables altogether.
11274
11275         my $file= shift;
11276         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11277
11278         $lc = property_ref('lc');
11279         $tc = property_ref('tc');
11280         $uc = property_ref('uc');
11281
11282         # For each of the case change mappings...
11283         foreach my $full_table ($lc, $tc, $uc) {
11284             my $full_name = $full_table->name;
11285             unless (defined $full_table && ! $full_table->is_empty) {
11286                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
11287             }
11288
11289             # Create a table in the old-style format and with the original
11290             # file name for backwards compatibility with applications that
11291             # read it directly.  The new tables contain both the simple and
11292             # full maps, and the old are missing simple maps when there is a
11293             # conflicting full one.  Probably it would have been ok to add
11294             # those to the legacy version, as was already done in 5.14 to the
11295             # case folding one, but this was not done, out of an abundance of
11296             # caution.  The tables are set up here before we deal with the
11297             # full maps so that as we handle those, we can override the simple
11298             # maps for them in the legacy table, and merely add them in the
11299             # new-style one.
11300             my $legacy = Property->new("Legacy_" . $full_table->full_name,
11301                                         File => $full_table->full_name =~
11302                                                             s/case_Mapping//r,
11303                                         Range_Size_1 => 1,
11304                                         Format => $HEX_FORMAT,
11305                                         Default_Map => $CODE_POINT,
11306                                         UCD => 0,
11307                                         Initialize => $full_table,
11308                                         To_Output_Map => $EXTERNAL_MAP,
11309             );
11310
11311             $full_table->add_comment(join_lines( <<END
11312 This file includes both the simple and full case changing maps.  The simple
11313 ones are in the main body of the table below, and the full ones adding to or
11314 overriding them are in the hash.
11315 END
11316             ));
11317
11318             # The simple version's name in each mapping merely has an 's' in
11319             # front of the full one's
11320             my $simple_name = 's' . $full_name;
11321             my $simple = property_ref($simple_name);
11322             $simple->initialize($full_table) if $simple->to_output_map();
11323         }
11324
11325         return;
11326     }
11327
11328     sub filter_2_1_8_special_casing_line {
11329
11330         # This version had duplicate entries in this file.  Delete all but the
11331         # first one
11332         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11333                                               # fields
11334         if (exists $special_casing_code_points{$fields[0]}) {
11335             $_ = "";
11336             return;
11337         }
11338
11339         $special_casing_code_points{$fields[0]} = 1;
11340         filter_special_casing_line(@_);
11341     }
11342
11343     sub filter_special_casing_line {
11344         # Change the format of $_ from SpecialCasing.txt into something that
11345         # the generic handler understands.  Each input line contains three
11346         # case mappings.  This will generate three lines to pass to the
11347         # generic handler for each of those.
11348
11349         # The input syntax (after stripping comments and trailing white space
11350         # is like one of the following (with the final two being entries that
11351         # we ignore):
11352         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11353         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11354         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11355         # Note the trailing semi-colon, unlike many of the input files.  That
11356         # means that there will be an extra null field generated by the split
11357
11358         my $file = shift;
11359         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11360
11361         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11362                                               # fields
11363
11364         # field #4 is when this mapping is conditional.  If any of these get
11365         # implemented, it would be by hard-coding in the casing functions in
11366         # the Perl core, not through tables.  But if there is a new condition
11367         # we don't know about, output a warning.  We know about all the
11368         # conditions through 6.0
11369         if ($fields[4] ne "") {
11370             my @conditions = split ' ', $fields[4];
11371             if ($conditions[0] ne 'tr'  # We know that these languages have
11372                                         # conditions, and some are multiple
11373                 && $conditions[0] ne 'az'
11374                 && $conditions[0] ne 'lt'
11375
11376                 # And, we know about a single condition Final_Sigma, but
11377                 # nothing else.
11378                 && ($v_version gt v5.2.0
11379                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11380             {
11381                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11382             }
11383             elsif ($conditions[0] ne 'Final_Sigma') {
11384
11385                     # Don't print out a message for Final_Sigma, because we
11386                     # have hard-coded handling for it.  (But the standard
11387                     # could change what the rule should be, but it wouldn't
11388                     # show up here anyway.
11389
11390                     print "# SKIPPING Special Casing: $_\n"
11391                                                     if $verbosity >= $VERBOSE;
11392             }
11393             $_ = "";
11394             return;
11395         }
11396         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11397             $file->carp_bad_line('Extra fields');
11398             $_ = "";
11399             return;
11400         }
11401
11402         my $decimal_code_point = hex $fields[0];
11403
11404         # Loop to handle each of the three mappings in the input line, in
11405         # order, with $i indicating the current field number.
11406         my $i = 0;
11407         for my $object ($lc, $tc, $uc) {
11408             $i++;   # First time through, $i = 0 ... 3rd time = 3
11409
11410             my $value = $object->value_of($decimal_code_point);
11411             $value = ($value eq $CODE_POINT)
11412                       ? $decimal_code_point
11413                       : hex $value;
11414
11415             # If this isn't a multi-character mapping, it should already have
11416             # been read in.
11417             if ($fields[$i] !~ / /) {
11418                 if ($value != hex $fields[$i]) {
11419                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
11420                                   . $object->name
11421                                   . "(0x$fields[0]) is $value"
11422                                   . " and SpecialCasing.txt thinks it is "
11423                                   . hex($fields[$i])
11424                                   . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
11425                 }
11426             }
11427             else {
11428
11429                 # The mapping goes into both the legacy table, in which it
11430                 # replaces the simple one...
11431                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11432                                              . $object->full_name
11433                                              . "; $fields[$i]");
11434
11435                 # ... and, the The regular table, in which it is additional,
11436                 # beyond the simple mapping.
11437                 $file->insert_adjusted_lines("$fields[0]; "
11438                                              . $object->name
11439                                             . "; "
11440                                             . $CMD_DELIM
11441                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11442                                             . $CMD_DELIM
11443                                             . $fields[$i]);
11444             }
11445         }
11446
11447         # Everything has been handled by the insert_adjusted_lines()
11448         $_ = "";
11449
11450         return;
11451     }
11452 }
11453
11454 sub filter_old_style_case_folding {
11455     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11456     # and later style.  Different letters were used in the earlier.
11457
11458     my $file = shift;
11459     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11460
11461     my @fields = split /\s*;\s*/;
11462     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11463         $fields[1] = 'I';
11464     }
11465     elsif ($fields[1] eq 'L') {
11466         $fields[1] = 'C';             # L => C always
11467     }
11468     elsif ($fields[1] eq 'E') {
11469         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
11470             $fields[1] = 'F'
11471         }
11472         else {
11473             $fields[1] = 'C'
11474         }
11475     }
11476     else {
11477         $file->carp_bad_line("Expecting L or E in second field");
11478         $_ = "";
11479         return;
11480     }
11481     $_ = join("; ", @fields) . ';';
11482     return;
11483 }
11484
11485 { # Closure for case folding
11486
11487     # Create the map for simple only if are going to output it, for otherwise
11488     # it takes no part in anything we do.
11489     my $to_output_simple;
11490     my $all_folds;
11491
11492     sub setup_case_folding($) {
11493         # Read in the case foldings in CaseFolding.txt.  This handles both
11494         # simple and full case folding.
11495
11496         $to_output_simple
11497                         = property_ref('Simple_Case_Folding')->to_output_map;
11498
11499         if (! $to_output_simple) {
11500             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11501         }
11502
11503         $all_folds = $perl->add_match_table("_Perl_Any_Folds",
11504                            Perl_Extension => 1,
11505                            Fate => $INTERNAL_ONLY,
11506                            Description => "Code points that particpate in some fold",
11507                            );
11508
11509         # If we ever wanted to show that these tables were combined, a new
11510         # property method could be created, like set_combined_props()
11511         property_ref('Case_Folding')->add_comment(join_lines( <<END
11512 This file includes both the simple and full case folding maps.  The simple
11513 ones are in the main body of the table below, and the full ones adding to or
11514 overriding them are in the hash.
11515 END
11516         ));
11517         return;
11518     }
11519
11520     sub filter_case_folding_line {
11521         # Called for each line in CaseFolding.txt
11522         # Input lines look like:
11523         # 0041; C; 0061; # LATIN CAPITAL LETTER A
11524         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11525         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11526         #
11527         # 'C' means that folding is the same for both simple and full
11528         # 'F' that it is only for full folding
11529         # 'S' that it is only for simple folding
11530         # 'T' is locale-dependent, and ignored
11531         # 'I' is a type of 'F' used in some early releases.
11532         # Note the trailing semi-colon, unlike many of the input files.  That
11533         # means that there will be an extra null field generated by the split
11534         # below, which we ignore and hence is not an error.
11535
11536         my $file = shift;
11537         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11538
11539         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11540         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11541             $file->carp_bad_line('Extra fields');
11542             $_ = "";
11543             return;
11544         }
11545
11546         if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
11547             $_ = "";
11548             return;
11549         }
11550
11551         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
11552         # I are all full foldings; S is single-char.  For S, there is always
11553         # an F entry, so we must allow multiple values for the same code
11554         # point.  Fortunately this table doesn't need further manipulation
11555         # which would preclude using multiple-values.  The S is now included
11556         # so that _swash_inversion_hash() is able to construct closures
11557         # without having to worry about F mappings.
11558         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11559             my $from = hex $range;  # Assumes range is single
11560             $all_folds->add_range($from, $from);
11561             $_ = "$range; Case_Folding; "
11562                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11563
11564             if ($type eq 'F') {
11565                 my @string = split " ", $map;
11566                 for my $i (0 .. @string  - 1 -1) {
11567                     my $decimal = hex $string[$i];
11568                     $all_folds->add_range($decimal, $decimal);
11569                 }
11570             }
11571             else {
11572                 $all_folds->add_range(hex $map, hex $map);
11573             }
11574         }
11575         else {
11576             $_ = "";
11577             $file->carp_bad_line('Expecting C F I S or T in second field');
11578         }
11579
11580         # C and S are simple foldings, but simple case folding is not needed
11581         # unless we explicitly want its map table output.
11582         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11583             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11584         }
11585
11586         return;
11587     }
11588
11589 } # End case fold closure
11590
11591 sub filter_jamo_line {
11592     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
11593     # from this file that is used in generating the Name property for Jamo
11594     # code points.  But, it also is used to convert early versions' syntax
11595     # into the modern form.  Here are two examples:
11596     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
11597     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
11598     #
11599     # The input is $_, the output is $_ filtered.
11600
11601     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
11602
11603     # Let the caller handle unexpected input.  In earlier versions, there was
11604     # a third field which is supposed to be a comment, but did not have a '#'
11605     # before it.
11606     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11607
11608     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
11609                                 # beginning.
11610
11611     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
11612     $fields[1] = 'R' if $fields[0] eq '1105';
11613
11614     # Add to structure so can generate Names from it.
11615     my $cp = hex $fields[0];
11616     my $short_name = $fields[1];
11617     $Jamo{$cp} = $short_name;
11618     if ($cp <= $LBase + $LCount) {
11619         $Jamo_L{$short_name} = $cp - $LBase;
11620     }
11621     elsif ($cp <= $VBase + $VCount) {
11622         $Jamo_V{$short_name} = $cp - $VBase;
11623     }
11624     elsif ($cp <= $TBase + $TCount) {
11625         $Jamo_T{$short_name} = $cp - $TBase;
11626     }
11627     else {
11628         Carp::my_carp_bug("Unexpected Jamo code point in $_");
11629     }
11630
11631
11632     # Reassemble using just the first two fields to look like a typical
11633     # property file line
11634     $_ = "$fields[0]; $fields[1]";
11635
11636     return;
11637 }
11638
11639 sub register_fraction($) {
11640     # This registers the input rational number so that it can be passed on to
11641     # utf8_heavy.pl, both in rational and floating forms.
11642
11643     my $rational = shift;
11644
11645     my $float = eval $rational;
11646     $nv_floating_to_rational{$float} = $rational;
11647     return;
11648 }
11649
11650 sub filter_numeric_value_line {
11651     # DNumValues contains lines of a different syntax than the typical
11652     # property file:
11653     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
11654     #
11655     # This routine transforms $_ containing the anomalous syntax to the
11656     # typical, by filtering out the extra columns, and convert early version
11657     # decimal numbers to strings that look like rational numbers.
11658
11659     my $file = shift;
11660     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11661
11662     # Starting in 5.1, there is a rational field.  Just use that, omitting the
11663     # extra columns.  Otherwise convert the decimal number in the second field
11664     # to a rational, and omit extraneous columns.
11665     my @fields = split /\s*;\s*/, $_, -1;
11666     my $rational;
11667
11668     if ($v_version ge v5.1.0) {
11669         if (@fields != 4) {
11670             $file->carp_bad_line('Not 4 semi-colon separated fields');
11671             $_ = "";
11672             return;
11673         }
11674         $rational = $fields[3];
11675         $_ = join '; ', @fields[ 0, 3 ];
11676     }
11677     else {
11678
11679         # Here, is an older Unicode file, which has decimal numbers instead of
11680         # rationals in it.  Use the fraction to calculate the denominator and
11681         # convert to rational.
11682
11683         if (@fields != 2 && @fields != 3) {
11684             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11685             $_ = "";
11686             return;
11687         }
11688
11689         my $codepoints = $fields[0];
11690         my $decimal = $fields[1];
11691         if ($decimal =~ s/\.0+$//) {
11692
11693             # Anything ending with a decimal followed by nothing but 0's is an
11694             # integer
11695             $_ = "$codepoints; $decimal";
11696             $rational = $decimal;
11697         }
11698         else {
11699
11700             my $denominator;
11701             if ($decimal =~ /\.50*$/) {
11702                 $denominator = 2;
11703             }
11704
11705             # Here have the hardcoded repeating decimals in the fraction, and
11706             # the denominator they imply.  There were only a few denominators
11707             # in the older Unicode versions of this file which this code
11708             # handles, so it is easy to convert them.
11709
11710             # The 4 is because of a round-off error in the Unicode 3.2 files
11711             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11712                 $denominator = 3;
11713             }
11714             elsif ($decimal =~ /\.[27]50*$/) {
11715                 $denominator = 4;
11716             }
11717             elsif ($decimal =~ /\.[2468]0*$/) {
11718                 $denominator = 5;
11719             }
11720             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11721                 $denominator = 6;
11722             }
11723             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11724                 $denominator = 8;
11725             }
11726             if ($denominator) {
11727                 my $sign = ($decimal < 0) ? "-" : "";
11728                 my $numerator = int((abs($decimal) * $denominator) + .5);
11729                 $rational = "$sign$numerator/$denominator";
11730                 $_ = "$codepoints; $rational";
11731             }
11732             else {
11733                 $file->carp_bad_line("Can't cope with number '$decimal'.");
11734                 $_ = "";
11735                 return;
11736             }
11737         }
11738     }
11739
11740     register_fraction($rational) if $rational =~ qr{/};
11741     return;
11742 }
11743
11744 { # Closure
11745     my %unihan_properties;
11746
11747     sub setup_unihan {
11748         # Do any special setup for Unihan properties.
11749
11750         # This property gives the wrong computed type, so override.
11751         my $usource = property_ref('kIRG_USource');
11752         $usource->set_type($STRING) if defined $usource;
11753
11754         # This property is to be considered binary (it says so in
11755         # http://www.unicode.org/reports/tr38/)
11756         my $iicore = property_ref('kIICore');
11757         if (defined $iicore) {
11758             $iicore->set_type($FORCED_BINARY);
11759             $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11760
11761             # Unicode doesn't include the maps for this property, so don't
11762             # warn that they are missing.
11763             $iicore->set_pre_declared_maps(0);
11764             $iicore->add_comment(join_lines( <<END
11765 This property contains enum values, but Unicode UAX #38 says it should be
11766 interpreted as binary, so Perl creates tables for both 1) its enum values,
11767 plus 2) true/false tables in which it is considered true for all code points
11768 that have a non-null value
11769 END
11770             ));
11771         }
11772
11773         return;
11774     }
11775
11776     sub filter_unihan_line {
11777         # Change unihan db lines to look like the others in the db.  Here is
11778         # an input sample:
11779         #   U+341C        kCangjie        IEKN
11780
11781         # Tabs are used instead of semi-colons to separate fields; therefore
11782         # they may have semi-colons embedded in them.  Change these to periods
11783         # so won't screw up the rest of the code.
11784         s/;/./g;
11785
11786         # Remove lines that don't look like ones we accept.
11787         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11788             $_ = "";
11789             return;
11790         }
11791
11792         # Extract the property, and save a reference to its object.
11793         my $property = $1;
11794         if (! exists $unihan_properties{$property}) {
11795             $unihan_properties{$property} = property_ref($property);
11796         }
11797
11798         # Don't do anything unless the property is one we're handling, which
11799         # we determine by seeing if there is an object defined for it or not
11800         if (! defined $unihan_properties{$property}) {
11801             $_ = "";
11802             return;
11803         }
11804
11805         # Convert the tab separators to our standard semi-colons, and convert
11806         # the U+HHHH notation to the rest of the standard's HHHH
11807         s/\t/;/g;
11808         s/\b U \+ (?= $code_point_re )//xg;
11809
11810         #local $to_trace = 1 if main::DEBUG;
11811         trace $_ if main::DEBUG && $to_trace;
11812
11813         return;
11814     }
11815 }
11816
11817 sub filter_blocks_lines {
11818     # In the Blocks.txt file, the names of the blocks don't quite match the
11819     # names given in PropertyValueAliases.txt, so this changes them so they
11820     # do match:  Blanks and hyphens are changed into underscores.  Also makes
11821     # early release versions look like later ones
11822     #
11823     # $_ is transformed to the correct value.
11824
11825     my $file = shift;
11826         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11827
11828     if ($v_version lt v3.2.0) {
11829         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11830             $_ = "";
11831             return;
11832         }
11833
11834         # Old versions used a different syntax to mark the range.
11835         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11836     }
11837
11838     my @fields = split /\s*;\s*/, $_, -1;
11839     if (@fields != 2) {
11840         $file->carp_bad_line("Expecting exactly two fields");
11841         $_ = "";
11842         return;
11843     }
11844
11845     # Change hyphens and blanks in the block name field only
11846     $fields[1] =~ s/[ -]/_/g;
11847     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
11848
11849     $_ = join("; ", @fields);
11850     return;
11851 }
11852
11853 { # Closure
11854     my $current_property;
11855
11856     sub filter_old_style_proplist {
11857         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
11858         # was in a completely different syntax.  Ken Whistler of Unicode says
11859         # that it was something he used as an aid for his own purposes, but
11860         # was never an official part of the standard.  Many of the properties
11861         # in it were incorporated into the later PropList.txt, but some were
11862         # not.  This program uses this early file to generate property tables
11863         # that are otherwise not accessible in the early UCD's.  It does this
11864         # for the ones that eventually became official, and don't appear to be
11865         # too different in their contents from the later official version, and
11866         # throws away the rest.  It could be argued that the ones it generates
11867         # were probably not really official at that time, so should be
11868         # ignored.  You can easily modify things to skip all of them by
11869         # changing this function to just set $_ to "", and return; and to skip
11870         # certain of them by by simply removing their declarations from
11871         # get_old_property_aliases().
11872         #
11873         # Here is a list of all the ones that are thrown away:
11874         #   Alphabetic                   The definitions for this are very
11875         #                                defective, so better to not mislead
11876         #                                people into thinking it works.
11877         #                                Instead the Perl extension of the
11878         #                                same name is constructed from first
11879         #                                principles.
11880         #   Bidi=*                       duplicates UnicodeData.txt
11881         #   Combining                    never made into official property;
11882         #                                is \P{ccc=0}
11883         #   Composite                    never made into official property.
11884         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
11885         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
11886         #   Delimiter                    never made into official property;
11887         #                                removed in 3.0.1
11888         #   Format Control               never made into official property;
11889         #                                similar to gc=cf
11890         #   High Surrogate               duplicates Blocks.txt
11891         #   Ignorable Control            never made into official property;
11892         #                                similar to di=y
11893         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
11894         #   Left of Pair                 never made into official property;
11895         #   Line Separator               duplicates UnicodeData.txt: gc=zl
11896         #   Low Surrogate                duplicates Blocks.txt
11897         #   Non-break                    was actually listed as a property
11898         #                                in 3.2, but without any code
11899         #                                points.  Unicode denies that this
11900         #                                was ever an official property
11901         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
11902         #   Numeric                      duplicates UnicodeData.txt: gc=cc
11903         #   Paired Punctuation           never made into official property;
11904         #                                appears to be gc=ps + gc=pe
11905         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
11906         #   Private Use                  duplicates UnicodeData.txt: gc=co
11907         #   Private Use High Surrogate   duplicates Blocks.txt
11908         #   Punctuation                  duplicates UnicodeData.txt: gc=p
11909         #   Space                        different definition than eventual
11910         #                                one.
11911         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
11912         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
11913         #   Zero-width                   never made into official property;
11914         #                                subset of gc=cf
11915         # Most of the properties have the same names in this file as in later
11916         # versions, but a couple do not.
11917         #
11918         # This subroutine filters $_, converting it from the old style into
11919         # the new style.  Here's a sample of the old-style
11920         #
11921         #   *******************************************
11922         #
11923         #   Property dump for: 0x100000A0 (Join Control)
11924         #
11925         #   200C..200D  (2 chars)
11926         #
11927         # In the example, the property is "Join Control".  It is kept in this
11928         # closure between calls to the subroutine.  The numbers beginning with
11929         # 0x were internal to Ken's program that generated this file.
11930
11931         # If this line contains the property name, extract it.
11932         if (/^Property dump for: [^(]*\((.*)\)/) {
11933             $_ = $1;
11934
11935             # Convert white space to underscores.
11936             s/ /_/g;
11937
11938             # Convert the few properties that don't have the same name as
11939             # their modern counterparts
11940             s/Identifier_Part/ID_Continue/
11941             or s/Not_a_Character/NChar/;
11942
11943             # If the name matches an existing property, use it.
11944             if (defined property_ref($_)) {
11945                 trace "new property=", $_ if main::DEBUG && $to_trace;
11946                 $current_property = $_;
11947             }
11948             else {        # Otherwise discard it
11949                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11950                 undef $current_property;
11951             }
11952             $_ = "";    # The property is saved for the next lines of the
11953                         # file, but this defining line is of no further use,
11954                         # so clear it so that the caller won't process it
11955                         # further.
11956         }
11957         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11958
11959             # Here, the input line isn't a header defining a property for the
11960             # following section, and either we aren't in such a section, or
11961             # the line doesn't look like one that defines the code points in
11962             # such a section.  Ignore this line.
11963             $_ = "";
11964         }
11965         else {
11966
11967             # Here, we have a line defining the code points for the current
11968             # stashed property.  Anything starting with the first blank is
11969             # extraneous.  Otherwise, it should look like a normal range to
11970             # the caller.  Append the property name so that it looks just like
11971             # a modern PropList entry.
11972
11973             $_ =~ s/\s.*//;
11974             $_ .= "; $current_property";
11975         }
11976         trace $_ if main::DEBUG && $to_trace;
11977         return;
11978     }
11979 } # End closure for old style proplist
11980
11981 sub filter_old_style_normalization_lines {
11982     # For early releases of Unicode, the lines were like:
11983     #        74..2A76    ; NFKD_NO
11984     # For later releases this became:
11985     #        74..2A76    ; NFKD_QC; N
11986     # Filter $_ to look like those in later releases.
11987     # Similarly for MAYBEs
11988
11989     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11990
11991     # Also, the property FC_NFKC was abbreviated to FNC
11992     s/FNC/FC_NFKC/;
11993     return;
11994 }
11995
11996 sub setup_script_extensions {
11997     # The Script_Extensions property starts out with a clone of the Script
11998     # property.
11999
12000     my $scx = property_ref("Script_Extensions");
12001     $scx = Property->new("scx", Full_Name => "Script_Extensions")
12002                                                             if ! defined $scx;
12003     $scx->_set_format($STRING_WHITE_SPACE_LIST);
12004     $scx->initialize($script);
12005     $scx->set_default_map($script->default_map);
12006     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
12007     $scx->add_comment(join_lines( <<END
12008 The values for code points that appear in one script are just the same as for
12009 the 'Script' property.  Likewise the values for those that appear in many
12010 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
12011 values of code points that appear in a few scripts are a space separated list
12012 of those scripts.
12013 END
12014     ));
12015
12016     # Initialize scx's tables and the aliases for them to be the same as sc's
12017     foreach my $table ($script->tables) {
12018         my $scx_table = $scx->add_match_table($table->name,
12019                                 Full_Name => $table->full_name);
12020         foreach my $alias ($table->aliases) {
12021             $scx_table->add_alias($alias->name);
12022         }
12023     }
12024 }
12025
12026 sub  filter_script_extensions_line {
12027     # The Scripts file comes with the full name for the scripts; the
12028     # ScriptExtensions, with the short name.  The final mapping file is a
12029     # combination of these, and without adjustment, would have inconsistent
12030     # entries.  This filters the latter file to convert to full names.
12031     # Entries look like this:
12032     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12033
12034     my @fields = split /\s*;\s*/;
12035
12036     # This script was erroneously omitted in this Unicode version.
12037     $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12038
12039     my @full_names;
12040     foreach my $short_name (split " ", $fields[1]) {
12041         push @full_names, $script->table($short_name)->full_name;
12042     }
12043     $fields[1] = join " ", @full_names;
12044     $_ = join "; ", @fields;
12045
12046     return;
12047 }
12048
12049 sub generate_hst {
12050
12051     # Populates the Hangul Syllable Type property from first principles
12052
12053     my $file= shift;
12054     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12055
12056     # These few ranges are hard-coded in.
12057     $file->insert_lines(split /\n/, <<'END'
12058 1100..1159    ; L
12059 115F          ; L
12060 1160..11A2    ; V
12061 11A8..11F9    ; T
12062 END
12063 );
12064
12065     # The Hangul syllables in version 1 are completely different than what came
12066     # after, so just ignore them there.
12067     if ($v_version lt v2.0.0) {
12068         my $property = property_ref($file->property);
12069         push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12070         push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12071         return;
12072     }
12073
12074     # The algorithmically derived syllables are almost all LVT ones, so
12075     # initialize the whole range with that.
12076     $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12077                         $SBase, $SBase + $SCount -1);
12078
12079     # Those ones that aren't LVT are LV, and they occur at intervals of
12080     # $TCount code points, starting with the first code point, at $SBase.
12081     for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12082         $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12083     }
12084
12085     return;
12086 }
12087
12088 sub generate_GCB {
12089
12090     # Populates the Grapheme Cluster Break property from first principles
12091
12092     my $file= shift;
12093     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12094
12095     # All these definitions are from
12096     # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12097     # from http://www.unicode.org/reports/tr29/tr29-4.html
12098
12099     foreach my $range ($gc->ranges) {
12100
12101         # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12102         # and gc=Cf
12103         if ($range->value =~ / ^ M [en] $ /x) {
12104             $file->insert_lines(sprintf "%04X..%04X; Extend",
12105                                 $range->start,  $range->end);
12106         }
12107         elsif ($range->value =~ / ^ C [cf] $ /x) {
12108             $file->insert_lines(sprintf "%04X..%04X; Control",
12109                                 $range->start,  $range->end);
12110         }
12111     }
12112     $file->insert_lines("2028; Control"); # Line Separator
12113     $file->insert_lines("2029; Control"); # Paragraph Separator
12114
12115     $file->insert_lines("000D; CR");
12116     $file->insert_lines("000A; LF");
12117
12118     # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12119     foreach my $code_point ( qw{
12120                                 40000
12121                                 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12122                                 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12123                                 }
12124     ) {
12125         my $category = $gc->value_of(hex $code_point);
12126         next if ! defined $category || $category eq 'Cn'; # But not if
12127                                                           # unassigned in this
12128                                                           # release
12129         $file->insert_lines("$code_point; Extend");
12130     }
12131
12132     my $hst = property_ref('Hangul_Syllable_Type');
12133     if ($hst->count > 0) {
12134         foreach my $range ($hst->ranges) {
12135             $file->insert_lines(sprintf "%04X..%04X; %s",
12136                                     $range->start, $range->end, $range->value);
12137         }
12138     }
12139     else {
12140         generate_hst($file);
12141     }
12142
12143     return;
12144 }
12145
12146 sub setup_early_name_alias {
12147     my $file= shift;
12148     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12149
12150     # This has the effect of pretending that the Name_Alias property was
12151     # available in all Unicode releases.  Strictly speaking, this property
12152     # should not be availabe in early releases, but doing this allows
12153     # charnames.pm to work on older releases without change.  Prior to v5.16
12154     # it had these names hard-coded inside it.  Unicode 6.1 came along and
12155     # created these names, and so they were removed from charnames.
12156
12157     my $aliases = property_ref('Name_Alias');
12158     if (! defined $aliases) {
12159         $aliases = Property->new('Name_Alias', Default_Map => "");
12160     }
12161
12162     $file->insert_lines(get_old_name_aliases());
12163
12164     return;
12165 }
12166
12167 sub get_old_name_aliases () {
12168
12169     # The Unicode_1_Name field, contains most of these names.  One would
12170     # expect, given the field's name, that its values would be fixed across
12171     # versions, giving the true Unicode version 1 name for the character.
12172     # Sadly, this is not the case.  Actually Version 1.1.5 had no names for
12173     # any of the controls; Version 2.0 introduced names for the C0 controls,
12174     # and 3.0 introduced C1 names.  3.0.1 removed the name INDEX; and 3.2
12175     # changed some names: it
12176     #   changed to parenthesized versions like "NEXT LINE" to
12177     #       "NEXT LINE (NEL)";
12178     #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12179     #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12180     #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12181     # This list contains all the names that were defined so that
12182     # charnames::vianame(), etc. understand them all EVEN if this version of
12183     # Unicode didn't specify them (this could be construed as a bug).
12184     # mktables elsewhere gives preference to the Unicode_1_Name field over
12185     # these names, so that viacode() will return the correct value for that
12186     # version of Unicode, except when that version doesn't define a name,
12187     # viacode() will return one anyway (this also could be construed as a
12188     # bug).  But these potential "bugs" allow for the smooth working of code
12189     # on earlier Unicode releases.
12190
12191     my @return = split /\n/, <<'END';
12192 0000;NULL;control
12193 0000;NUL;abbreviation
12194 0001;START OF HEADING;control
12195 0001;SOH;abbreviation
12196 0002;START OF TEXT;control
12197 0002;STX;abbreviation
12198 0003;END OF TEXT;control
12199 0003;ETX;abbreviation
12200 0004;END OF TRANSMISSION;control
12201 0004;EOT;abbreviation
12202 0005;ENQUIRY;control
12203 0005;ENQ;abbreviation
12204 0006;ACKNOWLEDGE;control
12205 0006;ACK;abbreviation
12206 0007;BELL;control
12207 0007;BEL;abbreviation
12208 0008;BACKSPACE;control
12209 0008;BS;abbreviation
12210 0009;CHARACTER TABULATION;control
12211 0009;HORIZONTAL TABULATION;control
12212 0009;HT;abbreviation
12213 0009;TAB;abbreviation
12214 000A;LINE FEED;control
12215 000A;LINE FEED (LF);control
12216 000A;NEW LINE;control
12217 000A;END OF LINE;control
12218 000A;LF;abbreviation
12219 000A;NL;abbreviation
12220 000A;EOL;abbreviation
12221 000B;LINE TABULATION;control
12222 000B;VERTICAL TABULATION;control
12223 000B;VT;abbreviation
12224 000C;FORM FEED;control
12225 000C;FORM FEED (FF);control
12226 000C;FF;abbreviation
12227 000D;CARRIAGE RETURN;control
12228 000D;CARRIAGE RETURN (CR);control
12229 000D;CR;abbreviation
12230 000E;SHIFT OUT;control
12231 000E;LOCKING-SHIFT ONE;control
12232 000E;SO;abbreviation
12233 000F;SHIFT IN;control
12234 000F;LOCKING-SHIFT ZERO;control
12235 000F;SI;abbreviation
12236 0010;DATA LINK ESCAPE;control
12237 0010;DLE;abbreviation
12238 0011;DEVICE CONTROL ONE;control
12239 0011;DC1;abbreviation
12240 0012;DEVICE CONTROL TWO;control
12241 0012;DC2;abbreviation
12242 0013;DEVICE CONTROL THREE;control
12243 0013;DC3;abbreviation
12244 0014;DEVICE CONTROL FOUR;control
12245 0014;DC4;abbreviation
12246 0015;NEGATIVE ACKNOWLEDGE;control
12247 0015;NAK;abbreviation
12248 0016;SYNCHRONOUS IDLE;control
12249 0016;SYN;abbreviation
12250 0017;END OF TRANSMISSION BLOCK;control
12251 0017;ETB;abbreviation
12252 0018;CANCEL;control
12253 0018;CAN;abbreviation
12254 0019;END OF MEDIUM;control
12255 0019;EOM;abbreviation
12256 001A;SUBSTITUTE;control
12257 001A;SUB;abbreviation
12258 001B;ESCAPE;control
12259 001B;ESC;abbreviation
12260 001C;INFORMATION SEPARATOR FOUR;control
12261 001C;FILE SEPARATOR;control
12262 001C;FS;abbreviation
12263 001D;INFORMATION SEPARATOR THREE;control
12264 001D;GROUP SEPARATOR;control
12265 001D;GS;abbreviation
12266 001E;INFORMATION SEPARATOR TWO;control
12267 001E;RECORD SEPARATOR;control
12268 001E;RS;abbreviation
12269 001F;INFORMATION SEPARATOR ONE;control
12270 001F;UNIT SEPARATOR;control
12271 001F;US;abbreviation
12272 0020;SP;abbreviation
12273 007F;DELETE;control
12274 007F;DEL;abbreviation
12275 0080;PADDING CHARACTER;figment
12276 0080;PAD;abbreviation
12277 0081;HIGH OCTET PRESET;figment
12278 0081;HOP;abbreviation
12279 0082;BREAK PERMITTED HERE;control
12280 0082;BPH;abbreviation
12281 0083;NO BREAK HERE;control
12282 0083;NBH;abbreviation
12283 0084;INDEX;control
12284 0084;IND;abbreviation
12285 0085;NEXT LINE;control
12286 0085;NEXT LINE (NEL);control
12287 0085;NEL;abbreviation
12288 0086;START OF SELECTED AREA;control
12289 0086;SSA;abbreviation
12290 0087;END OF SELECTED AREA;control
12291 0087;ESA;abbreviation
12292 0088;CHARACTER TABULATION SET;control
12293 0088;HORIZONTAL TABULATION SET;control
12294 0088;HTS;abbreviation
12295 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12296 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12297 0089;HTJ;abbreviation
12298 008A;LINE TABULATION SET;control
12299 008A;VERTICAL TABULATION SET;control
12300 008A;VTS;abbreviation
12301 008B;PARTIAL LINE FORWARD;control
12302 008B;PARTIAL LINE DOWN;control
12303 008B;PLD;abbreviation
12304 008C;PARTIAL LINE BACKWARD;control
12305 008C;PARTIAL LINE UP;control
12306 008C;PLU;abbreviation
12307 008D;REVERSE LINE FEED;control
12308 008D;REVERSE INDEX;control
12309 008D;RI;abbreviation
12310 008E;SINGLE SHIFT TWO;control
12311 008E;SINGLE-SHIFT-2;control
12312 008E;SS2;abbreviation
12313 008F;SINGLE SHIFT THREE;control
12314 008F;SINGLE-SHIFT-3;control
12315 008F;SS3;abbreviation
12316 0090;DEVICE CONTROL STRING;control
12317 0090;DCS;abbreviation
12318 0091;PRIVATE USE ONE;control
12319 0091;PRIVATE USE-1;control
12320 0091;PU1;abbreviation
12321 0092;PRIVATE USE TWO;control
12322 0092;PRIVATE USE-2;control
12323 0092;PU2;abbreviation
12324 0093;SET TRANSMIT STATE;control
12325 0093;STS;abbreviation
12326 0094;CANCEL CHARACTER;control
12327 0094;CCH;abbreviation
12328 0095;MESSAGE WAITING;control
12329 0095;MW;abbreviation
12330 0096;START OF GUARDED AREA;control
12331 0096;START OF PROTECTED AREA;control
12332 0096;SPA;abbreviation
12333 0097;END OF GUARDED AREA;control
12334 0097;END OF PROTECTED AREA;control
12335 0097;EPA;abbreviation
12336 0098;START OF STRING;control
12337 0098;SOS;abbreviation
12338 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12339 0099;SGC;abbreviation
12340 009A;SINGLE CHARACTER INTRODUCER;control
12341 009A;SCI;abbreviation
12342 009B;CONTROL SEQUENCE INTRODUCER;control
12343 009B;CSI;abbreviation
12344 009C;STRING TERMINATOR;control
12345 009C;ST;abbreviation
12346 009D;OPERATING SYSTEM COMMAND;control
12347 009D;OSC;abbreviation
12348 009E;PRIVACY MESSAGE;control
12349 009E;PM;abbreviation
12350 009F;APPLICATION PROGRAM COMMAND;control
12351 009F;APC;abbreviation
12352 00A0;NBSP;abbreviation
12353 00AD;SHY;abbreviation
12354 200B;ZWSP;abbreviation
12355 200C;ZWNJ;abbreviation
12356 200D;ZWJ;abbreviation
12357 200E;LRM;abbreviation
12358 200F;RLM;abbreviation
12359 202A;LRE;abbreviation
12360 202B;RLE;abbreviation
12361 202C;PDF;abbreviation
12362 202D;LRO;abbreviation
12363 202E;RLO;abbreviation
12364 FEFF;BYTE ORDER MARK;alternate
12365 FEFF;BOM;abbreviation
12366 FEFF;ZWNBSP;abbreviation
12367 END
12368
12369     if ($v_version ge v3.0.0) {
12370         push @return, split /\n/, <<'END';
12371 180B; FVS1; abbreviation
12372 180C; FVS2; abbreviation
12373 180D; FVS3; abbreviation
12374 180E; MVS; abbreviation
12375 202F; NNBSP; abbreviation
12376 END
12377     }
12378
12379     if ($v_version ge v3.2.0) {
12380         push @return, split /\n/, <<'END';
12381 034F; CGJ; abbreviation
12382 205F; MMSP; abbreviation
12383 2060; WJ; abbreviation
12384 END
12385         # Add in VS1..VS16
12386         my $cp = 0xFE00 - 1;
12387         for my $i (1..16) {
12388             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12389         }
12390     }
12391     if ($v_version ge v4.0.0) { # Add in VS17..VS256
12392         my $cp = 0xE0100 - 17;
12393         for my $i (17..256) {
12394             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12395         }
12396     }
12397
12398     # ALERT did not come along until 6.0, at which point it became preferred
12399     # over BELL, and was never in the Unicode_1_Name field.  For the same
12400     # reasons, that the other names are made known to all releases by this
12401     # function, we make ALERT known too.  By inserting it
12402     # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12403     my $alert = '0007; ALERT; control';
12404     if ($v_version lt v6.0.0) {
12405         push @return, $alert;
12406     }
12407     else {
12408         unshift @return, $alert;
12409     }
12410
12411     return @return;
12412 }
12413
12414 sub filter_later_version_name_alias_line {
12415
12416     # This file has an extra entry per line for the alias type.  This is
12417     # handled by creating a compound entry: "$alias: $type";  First, split
12418     # the line into components.
12419     my ($range, $alias, $type, @remainder)
12420         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12421
12422     # This file contains multiple entries for some components, so tell the
12423     # downstream code to allow this in our internal tables; the
12424     # $MULTIPLE_AFTER preserves the input ordering.
12425     $_ = join ";", $range, $CMD_DELIM
12426                            . $REPLACE_CMD
12427                            . '='
12428                            . $MULTIPLE_AFTER
12429                            . $CMD_DELIM
12430                            . "$alias: $type",
12431                    @remainder;
12432     return;
12433 }
12434
12435 sub filter_early_version_name_alias_line {
12436
12437     # Early versions did not have the trailing alias type field; implicitly it
12438     # was 'correction'.   But our synthetic lines we add in this program do
12439     # have it, so test for the type field.
12440     $_ .= "; correction" if $_ !~ /;.*;/;
12441
12442     filter_later_version_name_alias_line;
12443     return;
12444 }
12445
12446 sub finish_Unicode() {
12447     # This routine should be called after all the Unicode files have been read
12448     # in.  It:
12449     # 1) Creates properties that are missing from the version of Unicode being
12450     #    compiled, and which, for whatever reason, are needed for the Perl
12451     #    core to function properly.  These are minimally populated as
12452     #    necessary.
12453     # 2) Adds the mappings for code points missing from the files which have
12454     #    defaults specified for them.
12455     # 3) At this this point all mappings are known, so it computes the type of
12456     #    each property whose type hasn't been determined yet.
12457     # 4) Calculates all the regular expression match tables based on the
12458     #    mappings.
12459     # 5) Calculates and adds the tables which are defined by Unicode, but
12460     #    which aren't derived by them, and certain derived tables that Perl
12461     #    uses.
12462
12463     # Folding information was introduced later into Unicode data.  To get
12464     # Perl's case ignore (/i) to work at all in releases that don't have
12465     # folding, use the best available alternative, which is lower casing.
12466     my $fold = property_ref('Case_Folding');
12467     if ($fold->is_empty) {
12468         $fold->initialize(property_ref('Lowercase_Mapping'));
12469         $fold->add_note(join_lines(<<END
12470 WARNING: This table uses lower case as a substitute for missing fold
12471 information
12472 END
12473         ));
12474     }
12475
12476     # Multiple-character mapping was introduced later into Unicode data, so it
12477     # is by default the simple version.  If to output the simple versions and
12478     # not present, just use the regular (which in these Unicode versions is
12479     # the simple as well).
12480     foreach my $map (qw {   Uppercase_Mapping
12481                             Lowercase_Mapping
12482                             Titlecase_Mapping
12483                             Case_Folding
12484                         } )
12485     {
12486         my $simple = property_ref("Simple_$map");
12487         next if ! $simple->is_empty;
12488         if ($simple->to_output_map) {
12489             $simple->initialize(property_ref($map));
12490         }
12491         else {
12492             property_ref($map)->set_proxy_for($simple->name);
12493         }
12494     }
12495
12496     # For each property, fill in any missing mappings, and calculate the re
12497     # match tables.  If a property has more than one missing mapping, the
12498     # default is a reference to a data structure, and requires data from other
12499     # properties to resolve.  The sort is used to cause these to be processed
12500     # last, after all the other properties have been calculated.
12501     # (Fortunately, the missing properties so far don't depend on each other.)
12502     foreach my $property
12503         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12504         property_ref('*'))
12505     {
12506         # $perl has been defined, but isn't one of the Unicode properties that
12507         # need to be finished up.
12508         next if $property == $perl;
12509
12510         # Nor do we need to do anything with properties that aren't going to
12511         # be output.
12512         next if $property->fate == $SUPPRESSED;
12513
12514         # Handle the properties that have more than one possible default
12515         if (ref $property->default_map) {
12516             my $default_map = $property->default_map;
12517
12518             # These properties have stored in the default_map:
12519             # One or more of:
12520             #   1)  A default map which applies to all code points in a
12521             #       certain class
12522             #   2)  an expression which will evaluate to the list of code
12523             #       points in that class
12524             # And
12525             #   3) the default map which applies to every other missing code
12526             #      point.
12527             #
12528             # Go through each list.
12529             while (my ($default, $eval) = $default_map->get_next_defaults) {
12530
12531                 # Get the class list, and intersect it with all the so-far
12532                 # unspecified code points yielding all the code points
12533                 # in the class that haven't been specified.
12534                 my $list = eval $eval;
12535                 if ($@) {
12536                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12537                     last;
12538                 }
12539
12540                 # Narrow down the list to just those code points we don't have
12541                 # maps for yet.
12542                 $list = $list & $property->inverse_list;
12543
12544                 # Add mappings to the property for each code point in the list
12545                 foreach my $range ($list->ranges) {
12546                     $property->add_map($range->start, $range->end, $default,
12547                     Replace => $CROAK);
12548                 }
12549             }
12550
12551             # All remaining code points have the other mapping.  Set that up
12552             # so the normal single-default mapping code will work on them
12553             $property->set_default_map($default_map->other_default);
12554
12555             # And fall through to do that
12556         }
12557
12558         # We should have enough data now to compute the type of the property.
12559         $property->compute_type;
12560         my $property_type = $property->type;
12561
12562         next if ! $property->to_create_match_tables;
12563
12564         # Here want to create match tables for this property
12565
12566         # The Unicode db always (so far, and they claim into the future) have
12567         # the default for missing entries in binary properties be 'N' (unless
12568         # there is a '@missing' line that specifies otherwise)
12569         if ($property_type == $BINARY && ! defined $property->default_map) {
12570             $property->set_default_map('N');
12571         }
12572
12573         # Add any remaining code points to the mapping, using the default for
12574         # missing code points.
12575         my $default_table;
12576         if (defined (my $default_map = $property->default_map)) {
12577
12578             # Make sure there is a match table for the default
12579             if (! defined ($default_table = $property->table($default_map))) {
12580                 $default_table = $property->add_match_table($default_map);
12581             }
12582
12583             # And, if the property is binary, the default table will just
12584             # be the complement of the other table.
12585             if ($property_type == $BINARY) {
12586                 my $non_default_table;
12587
12588                 # Find the non-default table.
12589                 for my $table ($property->tables) {
12590                     next if $table == $default_table;
12591                     $non_default_table = $table;
12592                 }
12593                 $default_table->set_complement($non_default_table);
12594             }
12595             else {
12596
12597                 # This fills in any missing values with the default.  It's not
12598                 # necessary to do this with binary properties, as the default
12599                 # is defined completely in terms of the Y table.
12600                 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
12601                                    $default_map, Replace => $NO);
12602             }
12603         }
12604
12605         # Have all we need to populate the match tables.
12606         my $property_name = $property->name;
12607         my $maps_should_be_defined = $property->pre_declared_maps;
12608         foreach my $range ($property->ranges) {
12609             my $map = $range->value;
12610             my $table = $property->table($map);
12611             if (! defined $table) {
12612
12613                 # Integral and rational property values are not necessarily
12614                 # defined in PropValueAliases, but whether all the other ones
12615                 # should be depends on the property.
12616                 if ($maps_should_be_defined
12617                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12618                 {
12619                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
12620                 }
12621                 $table = $property->add_match_table($map);
12622             }
12623
12624             next if $table->complement != 0;    # Don't need to populate these
12625             $table->add_range($range->start, $range->end);
12626         }
12627
12628         # A forced binary property has additional true/false tables which
12629         # should have been set up when it was forced into binary.  The false
12630         # table matches exactly the same set as the property's default table.
12631         # The true table matches the complement of that.  The false table is
12632         # not the same as an additional set of aliases on top of the default
12633         # table, so use 'set_equivalent_to'.  If it were implemented as
12634         # additional aliases, various things would have to be adjusted, but
12635         # especially, if the user wants to get a list of names for the table
12636         # using Unicode::UCD::prop_value_aliases(), s/he should get a
12637         # different set depending on whether they want the default table or
12638         # the false table.
12639         if ($property_type == $FORCED_BINARY) {
12640             $property->table('N')->set_equivalent_to($default_table,
12641                                                      Related => 1);
12642             $property->table('Y')->set_complement($default_table);
12643         }
12644
12645         # For Perl 5.6 compatibility, all properties matchable in regexes can
12646         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
12647         # But warn if this creates a conflict with a (new) Unicode property
12648         # name, although it appears that Unicode has made a decision never to
12649         # begin a property name with 'Is_', so this shouldn't happen.
12650         foreach my $alias ($property->aliases) {
12651             my $Is_name = 'Is_' . $alias->name;
12652             if (defined (my $pre_existing = property_ref($Is_name))) {
12653                 Carp::my_carp(<<END
12654 There is already an alias named $Is_name (from " . $pre_existing . "), so
12655 creating one for $property won't work.  This is bad news.  If it is not too
12656 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
12657 from the git blame log for this area of the code that suppressed individual
12658 aliases that conflict with the new Unicode names.  Proceeding anyway.
12659 END
12660                 );
12661             }
12662         } # End of loop through aliases for this property
12663     } # End of loop through all Unicode properties.
12664
12665     # Fill in the mappings that Unicode doesn't completely furnish.  First the
12666     # single letter major general categories.  If Unicode were to start
12667     # delivering the values, this would be redundant, but better that than to
12668     # try to figure out if should skip and not get it right.  Ths could happen
12669     # if a new major category were to be introduced, and the hard-coded test
12670     # wouldn't know about it.
12671     # This routine depends on the standard names for the general categories
12672     # being what it thinks they are, like 'Cn'.  The major categories are the
12673     # union of all the general category tables which have the same first
12674     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12675     foreach my $minor_table ($gc->tables) {
12676         my $minor_name = $minor_table->name;
12677         next if length $minor_name == 1;
12678         if (length $minor_name != 2) {
12679             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
12680             next;
12681         }
12682
12683         my $major_name = uc(substr($minor_name, 0, 1));
12684         my $major_table = $gc->table($major_name);
12685         $major_table += $minor_table;
12686     }
12687
12688     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
12689     # defines it as LC)
12690     my $LC = $gc->table('LC');
12691     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
12692     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
12693
12694
12695     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12696                          # deliver the correct values in it
12697         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12698
12699         # Lt not in release 1.
12700         if (defined $gc->table('Lt')) {
12701             $LC += $gc->table('Lt');
12702             $gc->table('Lt')->set_caseless_equivalent($LC);
12703         }
12704     }
12705     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12706
12707     $gc->table('Ll')->set_caseless_equivalent($LC);
12708     $gc->table('Lu')->set_caseless_equivalent($LC);
12709
12710     my $Cs = $gc->table('Cs');
12711
12712     # Create digit and case fold tables with the original file names for
12713     # backwards compatibility with applications that read them directly.
12714     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12715                               Default_Map => "",
12716                               Perl_Extension => 1,
12717                               File => 'Digit',    # Trad. location
12718                               Directory => $map_directory,
12719                               UCD => 0,
12720                               Type => $STRING,
12721                               To_Output_Map => $EXTERNAL_MAP,
12722                               Range_Size_1 => 1,
12723                               Initialize => property_ref('Perl_Decimal_Digit'),
12724                             );
12725     $Digit->add_comment(join_lines(<<END
12726 This file gives the mapping of all code points which represent a single
12727 decimal digit [0-9] to their respective digits.  For example, the code point
12728 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
12729 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12730 numerals.
12731 END
12732     ));
12733
12734     Property->new('Legacy_Case_Folding',
12735                     File => "Fold",
12736                     Directory => $map_directory,
12737                     Default_Map => $CODE_POINT,
12738                     UCD => 0,
12739                     Range_Size_1 => 1,
12740                     Type => $STRING,
12741                     To_Output_Map => $EXTERNAL_MAP,
12742                     Format => $HEX_FORMAT,
12743                     Initialize => property_ref('cf'),
12744     );
12745
12746     # The Script_Extensions property started out as a clone of the Script
12747     # property.  But processing its data file caused some elements to be
12748     # replaced with different data.  (These elements were for the Common and
12749     # Inherited properties.)  This data is a qw() list of all the scripts that
12750     # the code points in the given range are in.  An example line is:
12751     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
12752     #
12753     # The code above has created a new match table named "Arab Syrc Thaa"
12754     # which contains 060C.  (The cloned table started out with this code point
12755     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
12756     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
12757     # match table.  This is repeated for all these tables and ranges.  The map
12758     # data is retained in the map table for reference, but the spurious match
12759     # tables are deleted.
12760
12761     my $scx = property_ref("Script_Extensions");
12762     if (defined $scx) {
12763         foreach my $table ($scx->tables) {
12764             next unless $table->name =~ /\s/;   # All the new and only the new
12765                                                 # tables have a space in their
12766                                                 # names
12767             my @scripts = split /\s+/, $table->name;
12768             foreach my $script (@scripts) {
12769                 my $script_table = $scx->table($script);
12770                 $script_table += $table;
12771             }
12772             $scx->delete_match_table($table);
12773         }
12774     }
12775
12776     return;
12777 }
12778
12779 sub pre_3_dot_1_Nl () {
12780
12781     # Return a range list for gc=nl for Unicode versions prior to 3.1, which
12782     # is when Unicode's became fully usable.  These code points were
12783     # determined by inspection and experimentation.  gc=nl is important for
12784     # certain Perl-extension properties that should be available in all
12785     # releases.
12786
12787     my $Nl = Range_List->new();
12788     if (defined (my $official = $gc->table('Nl'))) {
12789         $Nl += $official;
12790     }
12791     else {
12792         $Nl->add_range(0x2160, 0x2182);
12793         $Nl->add_range(0x3007, 0x3007);
12794         $Nl->add_range(0x3021, 0x3029);
12795     }
12796     $Nl->add_range(0xFE20, 0xFE23);
12797     $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
12798                                                             # these were added
12799     return $Nl;
12800 }
12801
12802 sub compile_perl() {
12803     # Create perl-defined tables.  Almost all are part of the pseudo-property
12804     # named 'perl' internally to this program.  Many of these are recommended
12805     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
12806     # on those found there.
12807     # Almost all of these are equivalent to some Unicode property.
12808     # A number of these properties have equivalents restricted to the ASCII
12809     # range, with their names prefaced by 'Posix', to signify that these match
12810     # what the Posix standard says they should match.  A couple are
12811     # effectively this, but the name doesn't have 'Posix' in it because there
12812     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
12813     # to the full Unicode range, by our guesses as to what is appropriate.
12814
12815     # 'Any' is all code points.  As an error check, instead of just setting it
12816     # to be that, construct it to be the union of all the major categories
12817     $Any = $perl->add_match_table('Any',
12818             Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
12819             Matches_All => 1);
12820
12821     foreach my $major_table ($gc->tables) {
12822
12823         # Major categories are the ones with single letter names.
12824         next if length($major_table->name) != 1;
12825
12826         $Any += $major_table;
12827     }
12828
12829     if ($Any->max != $MAX_UNICODE_CODEPOINT) {
12830         Carp::my_carp_bug("Generated highest code point ("
12831            . sprintf("%X", $Any->max)
12832            . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
12833     }
12834     if ($Any->range_count != 1 || $Any->min != 0) {
12835      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
12836     }
12837
12838     $Any->add_alias('All');
12839
12840     # Assigned is the opposite of gc=unassigned
12841     my $Assigned = $perl->add_match_table('Assigned',
12842                                 Description  => "All assigned code points",
12843                                 Initialize => ~ $gc->table('Unassigned'),
12844                                 );
12845
12846     # Our internal-only property should be treated as more than just a
12847     # synonym; grandfather it in to the pod.
12848     $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
12849                             Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
12850             ->set_equivalent_to(property_ref('ccc')->table('Above'),
12851                                                                 Related => 1);
12852
12853     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
12854     if (defined $block) {   # This is equivalent to the block if have it.
12855         my $Unicode_ASCII = $block->table('Basic_Latin');
12856         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
12857             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
12858         }
12859     }
12860
12861     # Very early releases didn't have blocks, so initialize ASCII ourselves if
12862     # necessary
12863     if ($ASCII->is_empty) {
12864         $ASCII->add_range(0, 127);
12865     }
12866
12867     # Get the best available case definitions.  Early Unicode versions didn't
12868     # have Uppercase and Lowercase defined, so use the general category
12869     # instead for them, modified by hard-coding in the code points each is
12870     # missing.
12871     my $Lower = $perl->add_match_table('Lower');
12872     my $Unicode_Lower = property_ref('Lowercase');
12873     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
12874         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
12875
12876     }
12877     else {
12878         $Lower += $gc->table('Lowercase_Letter');
12879
12880         # There are quite a few code points in Lower, that aren't in gc=lc,
12881         # and not all are in all releases.
12882         foreach my $code_point (    0x00AA,
12883                                     0x00BA,
12884                                     0x02B0 .. 0x02B8,
12885                                     0x02C0 .. 0x02C1,
12886                                     0x02E0 .. 0x02E4,
12887                                     0x0345,
12888                                     0x037A,
12889                                     0x1D2C .. 0x1D6A,
12890                                     0x1D78,
12891                                     0x1D9B .. 0x1DBF,
12892                                     0x2071,
12893                                     0x207F,
12894                                     0x2090 .. 0x209C,
12895                                     0x2170 .. 0x217F,
12896                                     0x24D0 .. 0x24E9,
12897                                     0x2C7C .. 0x2C7D,
12898                                     0xA770,
12899                                     0xA7F8 .. 0xA7F9,
12900         ) {
12901             # Don't include the code point unless it is assigned in this
12902             # release
12903             my $category = $gc->value_of(hex $code_point);
12904             next if ! defined $category || $category eq 'Cn';
12905
12906             $Lower += $code_point;
12907         }
12908     }
12909     $Lower->add_alias('XPosixLower');
12910     my $Posix_Lower = $perl->add_match_table("PosixLower",
12911                             Description => "[a-z]",
12912                             Initialize => $Lower & $ASCII,
12913                             );
12914
12915     my $Upper = $perl->add_match_table('Upper');
12916     my $Unicode_Upper = property_ref('Uppercase');
12917     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
12918         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
12919     }
12920     else {
12921
12922         # Unlike Lower, there are only two ranges in Upper that aren't in
12923         # gc=Lu, and all code points were assigned in all releases.
12924         $Upper += $gc->table('Uppercase_Letter');
12925         $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
12926         $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
12927     }
12928     $Upper->add_alias('XPosixUpper');
12929     my $Posix_Upper = $perl->add_match_table("PosixUpper",
12930                             Description => "[A-Z]",
12931                             Initialize => $Upper & $ASCII,
12932                             );
12933
12934     # Earliest releases didn't have title case.  Initialize it to empty if not
12935     # otherwise present
12936     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
12937                                        Description => '(= \p{Gc=Lt})');
12938     my $lt = $gc->table('Lt');
12939
12940     # Earlier versions of mktables had this related to $lt since they have
12941     # identical code points, but their caseless equivalents are not the same,
12942     # one being 'Cased' and the other being 'LC', and so now must be kept as
12943     # separate entities.
12944     if (defined $lt) {
12945         $Title += $lt;
12946     }
12947     else {
12948         push @tables_that_may_be_empty, $Title->complete_name;
12949     }
12950
12951     my $Unicode_Cased = property_ref('Cased');
12952     if (defined $Unicode_Cased) {
12953         my $yes = $Unicode_Cased->table('Y');
12954         my $no = $Unicode_Cased->table('N');
12955         $Title->set_caseless_equivalent($yes);
12956         if (defined $Unicode_Upper) {
12957             $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
12958             $Unicode_Upper->table('N')->set_caseless_equivalent($no);
12959         }
12960         $Upper->set_caseless_equivalent($yes);
12961         if (defined $Unicode_Lower) {
12962             $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
12963             $Unicode_Lower->table('N')->set_caseless_equivalent($no);
12964         }
12965         $Lower->set_caseless_equivalent($yes);
12966     }
12967     else {
12968         # If this Unicode version doesn't have Cased, set up the Perl
12969         # extension from first principles.  From Unicode 5.1: Definition D120:
12970         # A character C is defined to be cased if and only if C has the
12971         # Lowercase or Uppercase property or has a General_Category value of
12972         # Titlecase_Letter.
12973         my $cased = $perl->add_match_table('Cased',
12974                         Initialize => $Lower + $Upper + $Title,
12975                         Description => 'Uppercase or Lowercase or Titlecase',
12976                         );
12977         # $notcased is purely for the caseless equivalents below
12978         my $notcased = $perl->add_match_table('_Not_Cased',
12979                                 Initialize => ~ $cased,
12980                                 Fate => $INTERNAL_ONLY,
12981                                 Description => 'All not-cased code points');
12982         $Title->set_caseless_equivalent($cased);
12983         if (defined $Unicode_Upper) {
12984             $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
12985             $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
12986         }
12987         $Upper->set_caseless_equivalent($cased);
12988         if (defined $Unicode_Lower) {
12989             $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
12990             $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
12991         }
12992         $Lower->set_caseless_equivalent($cased);
12993     }
12994
12995     # Similarly, set up our own Case_Ignorable property if this Unicode
12996     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
12997     # C is defined to be case-ignorable if C has the value MidLetter or the
12998     # value MidNumLet for the Word_Break property or its General_Category is
12999     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13000     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13001
13002     # Perl has long had an internal-only alias for this property; grandfather
13003     # it in to the pod, but discourage its use.
13004     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13005                                                      Re_Pod_Entry => 1,
13006                                                      Fate => $INTERNAL_ONLY,
13007                                                      Status => $DISCOURAGED);
13008     my $case_ignorable = property_ref('Case_Ignorable');
13009     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13010         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13011                                                                 Related => 1);
13012     }
13013     else {
13014
13015         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13016
13017         # The following three properties are not in early releases
13018         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13019         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13020         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13021
13022         # For versions 4.1 - 5.0, there is no MidNumLet property, and
13023         # correspondingly the case-ignorable definition lacks that one.  For
13024         # 4.0, it appears that it was meant to be the same definition, but was
13025         # inadvertently omitted from the standard's text, so add it if the
13026         # property actually is there
13027         my $wb = property_ref('Word_Break');
13028         if (defined $wb) {
13029             my $midlet = $wb->table('MidLetter');
13030             $perl_case_ignorable += $midlet if defined $midlet;
13031             my $midnumlet = $wb->table('MidNumLet');
13032             $perl_case_ignorable += $midnumlet if defined $midnumlet;
13033         }
13034         else {
13035
13036             # In earlier versions of the standard, instead of the above two
13037             # properties , just the following characters were used:
13038             $perl_case_ignorable +=  0x0027  # APOSTROPHE
13039                                 +   0x00AD  # SOFT HYPHEN (SHY)
13040                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
13041         }
13042     }
13043
13044     # The remaining perl defined tables are mostly based on Unicode TR 18,
13045     # "Annex C: Compatibility Properties".  All of these have two versions,
13046     # one whose name generally begins with Posix that is posix-compliant, and
13047     # one that matches Unicode characters beyond the Posix, ASCII range
13048
13049     my $Alpha = $perl->add_match_table('Alpha');
13050
13051     # Alphabetic was not present in early releases
13052     my $Alphabetic = property_ref('Alphabetic');
13053     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13054         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13055     }
13056     else {
13057
13058         # The Alphabetic property doesn't exist for early releases, so
13059         # generate it.  The actual definition, in 5.2 terms is:
13060         #
13061         # gc=L + gc=Nl + Other_Alphabetic
13062         #
13063         # Other_Alphabetic is also not defined in these early releases, but it
13064         # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13065         # those last two as well, then subtract the relatively few of them that
13066         # shouldn't have been added.  (The gc=So range is the circled capital
13067         # Latin characters.  Early releases mistakenly didn't also include the
13068         # lower-case versions of these characters, and so we don't either, to
13069         # maintain consistency with those releases that first had this
13070         # property.
13071         $Alpha->initialize($gc->table('Letter')
13072                            + pre_3_dot_1_Nl()
13073                            + $gc->table('Mn')
13074                            + $gc->table('Mc')
13075                         );
13076         $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
13077         foreach my $range (     [ 0x0300, 0x0344 ],
13078                                 [ 0x0346, 0x034E ],
13079                                 [ 0x0360, 0x0362 ],
13080                                 [ 0x0483, 0x0486 ],
13081                                 [ 0x0591, 0x05AF ],
13082                                 [ 0x06DF, 0x06E0 ],
13083                                 [ 0x06EA, 0x06EC ],
13084                                 [ 0x0740, 0x074A ],
13085                                 0x093C,
13086                                 0x094D,
13087                                 [ 0x0951, 0x0954 ],
13088                                 0x09BC,
13089                                 0x09CD,
13090                                 0x0A3C,
13091                                 0x0A4D,
13092                                 0x0ABC,
13093                                 0x0ACD,
13094                                 0x0B3C,
13095                                 0x0B4D,
13096                                 0x0BCD,
13097                                 0x0C4D,
13098                                 0x0CCD,
13099                                 0x0D4D,
13100                                 0x0DCA,
13101                                 [ 0x0E47, 0x0E4C ],
13102                                 0x0E4E,
13103                                 [ 0x0EC8, 0x0ECC ],
13104                                 [ 0x0F18, 0x0F19 ],
13105                                 0x0F35,
13106                                 0x0F37,
13107                                 0x0F39,
13108                                 [ 0x0F3E, 0x0F3F ],
13109                                 [ 0x0F82, 0x0F84 ],
13110                                 [ 0x0F86, 0x0F87 ],
13111                                 0x0FC6,
13112                                 0x1037,
13113                                 0x1039,
13114                                 [ 0x17C9, 0x17D3 ],
13115                                 [ 0x20D0, 0x20DC ],
13116                                 0x20E1,
13117                                 [ 0x302A, 0x302F ],
13118                                 [ 0x3099, 0x309A ],
13119                                 [ 0xFE20, 0xFE23 ],
13120                                 [ 0x1D165, 0x1D169 ],
13121                                 [ 0x1D16D, 0x1D172 ],
13122                                 [ 0x1D17B, 0x1D182 ],
13123                                 [ 0x1D185, 0x1D18B ],
13124                                 [ 0x1D1AA, 0x1D1AD ],
13125         ) {
13126             if (ref $range) {
13127                 $Alpha->delete_range($range->[0], $range->[1]);
13128             }
13129             else {
13130                 $Alpha->delete_range($range, $range);
13131             }
13132         }
13133         $Alpha->add_description('Alphabetic');
13134         $Alpha->add_alias('Alphabetic');
13135     }
13136     $Alpha->add_alias('XPosixAlpha');
13137     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13138                             Description => "[A-Za-z]",
13139                             Initialize => $Alpha & $ASCII,
13140                             );
13141     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13142     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13143
13144     my $Alnum = $perl->add_match_table('Alnum',
13145                         Description => 'Alphabetic and (decimal) Numeric',
13146                         Initialize => $Alpha + $gc->table('Decimal_Number'),
13147                         );
13148     $Alnum->add_alias('XPosixAlnum');
13149     $perl->add_match_table("PosixAlnum",
13150                             Description => "[A-Za-z0-9]",
13151                             Initialize => $Alnum & $ASCII,
13152                             );
13153
13154     my $Word = $perl->add_match_table('Word',
13155                                 Description => '\w, including beyond ASCII;'
13156                                             . ' = \p{Alnum} + \pM + \p{Pc}',
13157                                 Initialize => $Alnum + $gc->table('Mark'),
13158                                 );
13159     $Word->add_alias('XPosixWord');
13160     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13161     if (defined $Pc) {
13162         $Word += $Pc;
13163     }
13164     else {
13165         $Word += ord('_');  # Make sure this is a $Word
13166     }
13167     my $JC = property_ref('Join_Control');  # Wasn't in release 1
13168     if (defined $JC) {
13169         $Word += $JC->table('Y');
13170     }
13171     else {
13172         $Word += 0x200C + 0x200D;
13173     }
13174
13175     # This is a Perl extension, so the name doesn't begin with Posix.
13176     my $PerlWord = $perl->add_match_table('PerlWord',
13177                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13178                     Initialize => $Word & $ASCII,
13179                     );
13180     $PerlWord->add_alias('PosixWord');
13181
13182     my $Blank = $perl->add_match_table('Blank',
13183                                 Description => '\h, Horizontal white space',
13184
13185                                 # 200B is Zero Width Space which is for line
13186                                 # break control, and was listed as
13187                                 # Space_Separator in early releases
13188                                 Initialize => $gc->table('Space_Separator')
13189                                             +   0x0009  # TAB
13190                                             -   0x200B, # ZWSP
13191                                 );
13192     $Blank->add_alias('HorizSpace');        # Another name for it.
13193     $Blank->add_alias('XPosixBlank');
13194     $perl->add_match_table("PosixBlank",
13195                             Description => "\\t and ' '",
13196                             Initialize => $Blank & $ASCII,
13197                             );
13198
13199     my $VertSpace = $perl->add_match_table('VertSpace',
13200                             Description => '\v',
13201                             Initialize => $gc->table('Line_Separator')
13202                                         + $gc->table('Paragraph_Separator')
13203                                         + 0x000A  # LINE FEED
13204                                         + 0x000B  # VERTICAL TAB
13205                                         + 0x000C  # FORM FEED
13206                                         + 0x000D  # CARRIAGE RETURN
13207                                         + 0x0085, # NEL
13208                             );
13209     # No Posix equivalent for vertical space
13210
13211     my $Space = $perl->add_match_table('Space',
13212                 Description => '\s including beyond ASCII plus vertical tab',
13213                 Initialize => $Blank + $VertSpace,
13214     );
13215     $Space->add_alias('XPosixSpace');
13216     $perl->add_match_table("PosixSpace",
13217                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
13218                             Initialize => $Space & $ASCII,
13219                             );
13220
13221     # Perl's traditional space doesn't include Vertical Tab
13222     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13223                                   Description => '\s, including beyond ASCII',
13224                                   #Initialize => $Space - 0x000B,
13225                                   Initialize => $Space,
13226                                 );
13227     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
13228     my $PerlSpace = $perl->add_match_table('PerlSpace',
13229                         Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13230                         Initialize => $XPerlSpace & $ASCII,
13231                             );
13232
13233
13234     my $Cntrl = $perl->add_match_table('Cntrl',
13235                                         Description => 'Control characters');
13236     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13237     $Cntrl->add_alias('XPosixCntrl');
13238     $perl->add_match_table("PosixCntrl",
13239                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13240                             Initialize => $Cntrl & $ASCII,
13241                             );
13242
13243     # $controls is a temporary used to construct Graph.
13244     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13245                                                 + $gc->table('Control'));
13246     # Cs not in release 1
13247     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13248
13249     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
13250     my $Graph = $perl->add_match_table('Graph',
13251                         Description => 'Characters that are graphical',
13252                         Initialize => ~ ($Space + $controls),
13253                         );
13254     $Graph->add_alias('XPosixGraph');
13255     $perl->add_match_table("PosixGraph",
13256                             Description =>
13257                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
13258                             Initialize => $Graph & $ASCII,
13259                             );
13260
13261     $print = $perl->add_match_table('Print',
13262                         Description => 'Characters that are graphical plus space characters (but no controls)',
13263                         Initialize => $Blank + $Graph - $gc->table('Control'),
13264                         );
13265     $print->add_alias('XPosixPrint');
13266     $perl->add_match_table("PosixPrint",
13267                             Description =>
13268                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
13269                             Initialize => $print & $ASCII,
13270                             );
13271
13272     my $Punct = $perl->add_match_table('Punct');
13273     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13274
13275     # \p{punct} doesn't include the symbols, which posix does
13276     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13277                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
13278                     Initialize => $gc->table('Punctuation')
13279                                 + ($ASCII & $gc->table('Symbol')),
13280                                 Perl_Extension => 1
13281         );
13282     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13283         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
13284         Initialize => $ASCII & $XPosixPunct,
13285         );
13286
13287     my $Digit = $perl->add_match_table('Digit',
13288                             Description => '[0-9] + all other decimal digits');
13289     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13290     $Digit->add_alias('XPosixDigit');
13291     my $PosixDigit = $perl->add_match_table("PosixDigit",
13292                                             Description => '[0-9]',
13293                                             Initialize => $Digit & $ASCII,
13294                                             );
13295
13296     # Hex_Digit was not present in first release
13297     my $Xdigit = $perl->add_match_table('XDigit');
13298     $Xdigit->add_alias('XPosixXDigit');
13299     my $Hex = property_ref('Hex_Digit');
13300     if (defined $Hex && ! $Hex->is_empty) {
13301         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13302     }
13303     else {
13304         # (Have to use hex instead of e.g. '0', because could be running on an
13305         # non-ASCII machine, and we want the Unicode (ASCII) values)
13306         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
13307                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13308         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13309     }
13310
13311     # AHex was not present in early releases
13312     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13313     my $AHex = property_ref('ASCII_Hex_Digit');
13314     if (defined $AHex && ! $AHex->is_empty) {
13315         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13316     }
13317     else {
13318         $PosixXDigit->initialize($Xdigit & $ASCII);
13319         $PosixXDigit->add_alias('AHex');
13320         $PosixXDigit->add_alias('Ascii_Hex_Digit');
13321     }
13322     $PosixXDigit->add_description('[0-9A-Fa-f]');
13323
13324     my $dt = property_ref('Decomposition_Type');
13325     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13326         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13327         Perl_Extension => 1,
13328         Note => 'Union of all non-canonical decompositions',
13329         );
13330
13331     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13332     # than SD appeared, construct it ourselves, based on the first release SD
13333     # was in.  A pod entry is grandfathered in for it
13334     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13335                                            Perl_Extension => 1,
13336                                            Fate => $INTERNAL_ONLY,
13337                                            Status => $DISCOURAGED);
13338     my $soft_dotted = property_ref('Soft_Dotted');
13339     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13340         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13341     }
13342     else {
13343
13344         # This list came from 3.2 Soft_Dotted; all of these code points are in
13345         # all releases
13346         $CanonDCIJ->initialize([ 0x0069,
13347                                  0x006A,
13348                                  0x012F,
13349                                  0x0268,
13350                                  0x0456,
13351                                  0x0458,
13352                                  0x1E2D,
13353                                  0x1ECB,
13354                                ]);
13355         $CanonDCIJ = $CanonDCIJ & $Assigned;
13356     }
13357
13358     # For backward compatibility, Perl has its own definition for IDStart.
13359     # It is regular XID_Start plus the underscore, but all characters must be
13360     # Word characters as well
13361     my $XID_Start = property_ref('XID_Start');
13362     my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13363                                             Perl_Extension => 1,
13364                                             Fate => $INTERNAL_ONLY,
13365                                             Initialize => ord('_')
13366                                             );
13367     if (defined $XID_Start
13368         || defined ($XID_Start = property_ref('ID_Start')))
13369     {
13370         $perl_xids += $XID_Start->table('Y');
13371     }
13372     else {
13373         # For Unicode versions that don't have the property, construct our own
13374         # from first principles.  The actual definition is:
13375         #     Letters
13376         #   + letter numbers (Nl)
13377         #   - Pattern_Syntax
13378         #   - Pattern_White_Space
13379         #   + stability extensions
13380         #   - NKFC modifications
13381         #
13382         # What we do in the code below is to include the identical code points
13383         # that are in the first release that had Unicode's version of this
13384         # property, essentially extrapolating backwards.  There were no
13385         # stability extensions until v4.1, so none are included; likewise in
13386         # no Unicode version so far do subtracting PatSyn and PatWS make any
13387         # difference, so those also are ignored.
13388         $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
13389
13390         # We do subtract the NFKC modifications that are in the first version
13391         # that had this property.  We don't bother to test if they are in the
13392         # version in question, because if they aren't, the operation is a
13393         # no-op.  The NKFC modifications are discussed in
13394         # http://www.unicode.org/reports/tr31/#NFKC_Modifications
13395         foreach my $range ( 0x037A,
13396                             0x0E33,
13397                             0x0EB3,
13398                             [ 0xFC5E, 0xFC63 ],
13399                             [ 0xFDFA, 0xFE70 ],
13400                             [ 0xFE72, 0xFE76 ],
13401                             0xFE78,
13402                             0xFE7A,
13403                             0xFE7C,
13404                             0xFE7E,
13405                             [ 0xFF9E, 0xFF9F ],
13406         ) {
13407             if (ref $range) {
13408                 $perl_xids->delete_range($range->[0], $range->[1]);
13409             }
13410             else {
13411                 $perl_xids->delete_range($range, $range);
13412             }
13413         }
13414     }
13415
13416     $perl_xids &= $Word;
13417
13418     my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
13419                                         Perl_Extension => 1,
13420                                         Fate => $INTERNAL_ONLY);
13421     my $XIDC = property_ref('XID_Continue');
13422     if (defined $XIDC
13423         || defined ($XIDC = property_ref('ID_Continue')))
13424     {
13425         $perl_xidc += $XIDC->table('Y');
13426     }
13427     else {
13428         # Similarly, we construct our own XIDC if necessary for early Unicode
13429         # versions.  The definition is:
13430         #     everything in XIDS
13431         #   + Gc=Mn
13432         #   + Gc=Mc
13433         #   + Gc=Nd
13434         #   + Gc=Pc
13435         #   - Pattern_Syntax
13436         #   - Pattern_White_Space
13437         #   + stability extensions
13438         #   - NFKC modifications
13439         #
13440         # The same thing applies to this as with XIDS for the PatSyn, PatWS,
13441         # and stability extensions.  There is a somewhat different set of NFKC
13442         # mods to remove (and add in this case).  The ones below make this
13443         # have identical code points as in the first release that defined it.
13444         $perl_xidc += $perl_xids
13445                     + $gc->table('L')
13446                     + $gc->table('Mn')
13447                     + $gc->table('Mc')
13448                     + $gc->table('Nd')
13449                     + 0x00B7
13450                     ;
13451         if (defined (my $pc = $gc->table('Pc'))) {
13452             $perl_xidc += $pc;
13453         }
13454         else {  # 1.1.5 didn't have Pc, but these should have been in it
13455             $perl_xidc += 0xFF3F;
13456             $perl_xidc->add_range(0x203F, 0x2040);
13457             $perl_xidc->add_range(0xFE33, 0xFE34);
13458             $perl_xidc->add_range(0xFE4D, 0xFE4F);
13459         }
13460
13461         # Subtract the NFKC mods
13462         foreach my $range ( 0x037A,
13463                             [ 0xFC5E, 0xFC63 ],
13464                             [ 0xFDFA, 0xFE1F ],
13465                             0xFE70,
13466                             [ 0xFE72, 0xFE76 ],
13467                             0xFE78,
13468                             0xFE7A,
13469                             0xFE7C,
13470                             0xFE7E,
13471         ) {
13472             if (ref $range) {
13473                 $perl_xidc->delete_range($range->[0], $range->[1]);
13474             }
13475             else {
13476                 $perl_xidc->delete_range($range, $range);
13477             }
13478         }
13479     }
13480
13481     $perl_xidc &= $Word;
13482
13483     # These two tables are for matching \X, which is based on the 'extended'
13484     # grapheme cluster, which came in 5.1; create empty ones if not already
13485     # present.  The straight 'grapheme cluster' (non-extended) is used prior
13486     # to 5.1, and differs from the extended (see
13487     # http://www.unicode.org/reports/tr29/) only by these two tables, so we
13488     # get the older definition automatically when they are empty.
13489     my $gcb = property_ref('Grapheme_Cluster_Break');
13490     my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
13491                                         Perl_Extension => 1,
13492                                         Fate => $INTERNAL_ONLY);
13493     if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
13494         $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
13495     }
13496     else {
13497         push @tables_that_may_be_empty, $perl_prepend->complete_name;
13498     }
13499
13500     # All the tables with _X_ in their names are used in defining \X handling,
13501     # and are based on the Unicode GCB property.  Basically, \X matches:
13502     #   CR-LF
13503     #   | Prepend* Begin Extend*
13504     #   | .
13505     # Begin is:           ( Special_Begin | ! Control )
13506     # Begin is also:      ( Regular_Begin | Special_Begin )
13507     #   where Regular_Begin is defined as ( ! Control - Special_Begin )
13508     # Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
13509     # Extend is:          ( Grapheme_Extend | Spacing_Mark )
13510     # Control is:         [ GCB_Control CR LF ]
13511     # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
13512
13513     foreach my $gcb_name (qw{ L V T LV LVT }) {
13514
13515         # The perl internal extension's name is the gcb table name prepended
13516         # with an '_X_'
13517         my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
13518                                         Perl_Extension => 1,
13519                                         Fate => $INTERNAL_ONLY,
13520                                         Initialize => $gcb->table($gcb_name),
13521                                         );
13522         # Version 1 had mostly different Hangul syllables that were removed
13523         # from later versions, so some of the tables may not apply.
13524         if ($v_version lt v2.0) {
13525             push @tables_that_may_be_empty, $perl_table->complete_name;
13526         }
13527     }
13528
13529     # More GCB.  Populate a combined hangul syllables table
13530     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
13531                                           Perl_Extension => 1,
13532                                           Fate => $INTERNAL_ONLY);
13533     $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
13534     $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
13535
13536     my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
13537                                     Fate => $INTERNAL_ONLY);
13538     if ($v_version ge v6.2) {
13539         $ri += $gcb->table('RI');
13540     }
13541     else {
13542         push @tables_that_may_be_empty, $ri->full_name;
13543     }
13544
13545     my $specials_begin = $perl->add_match_table('_X_Special_Begin',
13546                                        Perl_Extension => 1,
13547                                        Fate => $INTERNAL_ONLY,
13548                                        Initialize => $lv_lvt_v
13549                                                    + $gcb->table('L')
13550                                                    + $gcb->table('T')
13551                                                    + $ri
13552                                       );
13553     $specials_begin->add_comment(join_lines( <<END
13554 For use in \\X; matches first character of potential multi-character
13555 sequences that can begin an extended grapheme cluster.  They need special
13556 handling because of their complicated nature.
13557 END
13558     ));
13559     my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
13560                                        Perl_Extension => 1,
13561                                        Fate => $INTERNAL_ONLY,
13562                                        Initialize => ~ $gcb->table('Control')
13563                                                    - $specials_begin
13564                                                    - $gcb->table('CR')
13565                                                    - $gcb->table('LF')
13566                                       );
13567     $regular_begin->add_comment(join_lines( <<END
13568 For use in \\X; matches first character of anything that can begin an extended
13569 grapheme cluster, except those that require special handling.
13570 END
13571     ));
13572
13573     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
13574                                         Fate => $INTERNAL_ONLY,
13575                                         Initialize => $gcb->table('Extend')
13576                                        );
13577     if (defined (my $sm = $gcb->table('SpacingMark'))) {
13578         $extend += $sm;
13579     }
13580     $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
13581
13582     # End of GCB \X processing
13583
13584     my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
13585
13586     if (@named_sequences) {
13587         push @composition, 'Named_Sequence';
13588         foreach my $sequence (@named_sequences) {
13589             $perl_charname->add_anomalous_entry($sequence);
13590         }
13591     }
13592
13593     my $alias_sentence = "";
13594     my %abbreviations;
13595     my $alias = property_ref('Name_Alias');
13596     $perl_charname->set_proxy_for('Name_Alias');
13597
13598     # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
13599     # respect to any existing entry depends on the entry type.  Corrections go
13600     # before said entry, as they should be returned in preference over the
13601     # existing entry.  (A correction to a correction should be later in the
13602     # Name_Alias table, so it will correctly precede the erroneous correction
13603     # in Perl_Charnames.)
13604     #
13605     # Abbreviations go after everything else, so they are saved temporarily in
13606     # a hash for later.
13607     #
13608     # Everything else is added added afterwards, which preserves the input
13609     # ordering
13610
13611     foreach my $range ($alias->ranges) {
13612         next if $range->value eq "";
13613         my $code_point = $range->start;
13614         if ($code_point != $range->end) {
13615             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
13616         }
13617         my ($value, $type) = split ': ', $range->value;
13618         my $replace_type;
13619         if ($type eq 'correction') {
13620             $replace_type = $MULTIPLE_BEFORE;
13621         }
13622         elsif ($type eq 'abbreviation') {
13623
13624             # Save for later
13625             $abbreviations{$value} = $code_point;
13626             next;
13627         }
13628         else {
13629             $replace_type = $MULTIPLE_AFTER;
13630         }
13631
13632         # Actually add; before or after current entry(ies) as determined
13633         # above.
13634
13635         $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
13636     }
13637     $alias_sentence = <<END;
13638 The Name_Alias property adds duplicate code point entries that are
13639 alternatives to the original name.  If an addition is a corrected
13640 name, it will be physically first in the table.  The original (less correct,
13641 but still valid) name will be next; then any alternatives, in no particular
13642 order; and finally any abbreviations, again in no particular order.
13643 END
13644
13645     # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
13646     # precedence before 6.1, so should be first in the file; the other names
13647     # have precedence starting in 6.1,
13648     my $before_or_after = ($v_version lt v6.1.0)
13649                           ? $MULTIPLE_BEFORE
13650                           : $MULTIPLE_AFTER;
13651
13652     foreach my $range (property_ref('Unicode_1_Name')->ranges) {
13653         my $code_point = $range->start;
13654         my $unicode_1_value = $range->value;
13655         next if $unicode_1_value eq "";     # Skip if name doesn't exist.
13656
13657         if ($code_point != $range->end) {
13658             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
13659         }
13660
13661         # To handle EBCDIC, we don't hard code in the code points of the
13662         # controls; instead realizing that all of them are below 256.
13663         last if $code_point > 255;
13664
13665         # We only add in the controls.
13666         next if $gc->value_of($code_point) ne 'Cc';
13667
13668         # We reject this Unicode1 name for later Perls, as it is used for
13669         # another code point
13670         next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
13671
13672         # This won't add an exact duplicate.
13673         $perl_charname->add_duplicate($code_point, $unicode_1_value,
13674                                         Replace => $before_or_after);
13675     }
13676
13677     # But in this version only, the ALERT has precedence over BELL, the
13678     # Unicode_1_Name that would otherwise have precedence.
13679     if ($v_version eq v6.0.0) {
13680         $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
13681     }
13682
13683     # Now that have everything added, add in abbreviations after
13684     # everything else.
13685     foreach my $value (keys %abbreviations) {
13686         $perl_charname->add_duplicate($abbreviations{$value}, $value,
13687                                         Replace => $MULTIPLE_AFTER);
13688     }
13689
13690     my $comment;
13691     if (@composition <= 2) { # Always at least 2
13692         $comment = join " and ", @composition;
13693     }
13694     else {
13695         $comment = join ", ", @composition[0 .. scalar @composition - 2];
13696         $comment .= ", and $composition[-1]";
13697     }
13698
13699     $perl_charname->add_comment(join_lines( <<END
13700 This file is for charnames.pm.  It is the union of the $comment properties.
13701 Unicode_1_Name entries are used only for nameless code points in the Name
13702 property.
13703 $alias_sentence
13704 This file doesn't include the algorithmically determinable names.  For those,
13705 use 'unicore/Name.pm'
13706 END
13707     ));
13708     property_ref('Name')->add_comment(join_lines( <<END
13709 This file doesn't include the algorithmically determinable names.  For those,
13710 use 'unicore/Name.pm'
13711 END
13712     ));
13713
13714     # Construct the Present_In property from the Age property.
13715     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
13716         my $default_map = $age->default_map;
13717         my $in = Property->new('In',
13718                                 Default_Map => $default_map,
13719                                 Full_Name => "Present_In",
13720                                 Perl_Extension => 1,
13721                                 Type => $ENUM,
13722                                 Initialize => $age,
13723                                 );
13724         $in->add_comment(join_lines(<<END
13725 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
13726 same as for $age, and not for what $in really means.  This is because anything
13727 defined in a given release should have multiple values: that release and all
13728 higher ones.  But only one value per code point can be represented in a table
13729 like this.
13730 END
13731         ));
13732
13733         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
13734         # lowest numbered (earliest) come first, with the non-numeric one
13735         # last.
13736         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
13737                                             ? 1
13738                                             : ($b->name !~ /^[\d.]*$/)
13739                                                 ? -1
13740                                                 : $a->name <=> $b->name
13741                                             } $age->tables;
13742
13743         # The Present_In property is the cumulative age properties.  The first
13744         # one hence is identical to the first age one.
13745         my $previous_in = $in->add_match_table($first_age->name);
13746         $previous_in->set_equivalent_to($first_age, Related => 1);
13747
13748         my $description_start = "Code point's usage introduced in version ";
13749         $first_age->add_description($description_start . $first_age->name);
13750
13751         # To construct the accumulated values, for each of the age tables
13752         # starting with the 2nd earliest, merge the earliest with it, to get
13753         # all those code points existing in the 2nd earliest.  Repeat merging
13754         # the new 2nd earliest with the 3rd earliest to get all those existing
13755         # in the 3rd earliest, and so on.
13756         foreach my $current_age (@rest_ages) {
13757             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
13758
13759             my $current_in = $in->add_match_table(
13760                                     $current_age->name,
13761                                     Initialize => $current_age + $previous_in,
13762                                     Description => $description_start
13763                                                     . $current_age->name
13764                                                     . ' or earlier',
13765                                     );
13766             $previous_in = $current_in;
13767
13768             # Add clarifying material for the corresponding age file.  This is
13769             # in part because of the confusing and contradictory information
13770             # given in the Standard's documentation itself, as of 5.2.
13771             $current_age->add_description(
13772                             "Code point's usage was introduced in version "
13773                             . $current_age->name);
13774             $current_age->add_note("See also $in");
13775
13776         }
13777
13778         # And finally the code points whose usages have yet to be decided are
13779         # the same in both properties.  Note that permanently unassigned code
13780         # points actually have their usage assigned (as being permanently
13781         # unassigned), so that these tables are not the same as gc=cn.
13782         my $unassigned = $in->add_match_table($default_map);
13783         my $age_default = $age->table($default_map);
13784         $age_default->add_description(<<END
13785 Code point's usage has not been assigned in any Unicode release thus far.
13786 END
13787         );
13788         $unassigned->set_equivalent_to($age_default, Related => 1);
13789     }
13790
13791     # See L<perlfunc/quotemeta>
13792     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
13793                                            Perl_Extension => 1,
13794                                            Fate => $INTERNAL_ONLY,
13795
13796                                            # Initialize to what's common in
13797                                            # all Unicode releases.
13798                                            Initialize =>
13799                                                 $Space
13800                                                 + $gc->table('Control')
13801                            );
13802
13803     # In early releases without the proper Unicode properties, just set to \W.
13804     if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
13805         || ! defined (my $patws = property_ref('Pattern_White_Space'))
13806         || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
13807     {
13808         $quotemeta += ~ $Word;
13809     }
13810     else {
13811         $quotemeta += $patsyn->table('Y')
13812                    + $patws->table('Y')
13813                    + $di->table('Y')
13814                    + ((~ $Word) & $ASCII);
13815     }
13816
13817     # Finished creating all the perl properties.  All non-internal non-string
13818     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
13819     # an underscore.)  These do not get a separate entry in the pod file
13820     foreach my $table ($perl->tables) {
13821         foreach my $alias ($table->aliases) {
13822             next if $alias->name =~ /^_/;
13823             $table->add_alias('Is_' . $alias->name,
13824                                Re_Pod_Entry => 0,
13825                                UCD => 0,
13826                                Status => $alias->status,
13827                                OK_as_Filename => 0);
13828         }
13829     }
13830
13831     # Here done with all the basic stuff.  Ready to populate the information
13832     # about each character if annotating them.
13833     if ($annotate) {
13834
13835         # See comments at its declaration
13836         $annotate_ranges = Range_Map->new;
13837
13838         # This separates out the non-characters from the other unassigneds, so
13839         # can give different annotations for each.
13840         $unassigned_sans_noncharacters = Range_List->new(
13841                                     Initialize => $gc->table('Unassigned'));
13842         if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
13843             $unassigned_sans_noncharacters &= $nonchars->table('N');
13844         }
13845
13846         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
13847             $i = populate_char_info($i);    # Note sets $i so may cause skips
13848         }
13849     }
13850
13851     return;
13852 }
13853
13854 sub add_perl_synonyms() {
13855     # A number of Unicode tables have Perl synonyms that are expressed in
13856     # the single-form, \p{name}.  These are:
13857     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
13858     #       \p{Is_Name} as synonyms
13859     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
13860     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
13861     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
13862     #       conflict, \p{Value} and \p{Is_Value} as well
13863     #
13864     # This routine generates these synonyms, warning of any unexpected
13865     # conflicts.
13866
13867     # Construct the list of tables to get synonyms for.  Start with all the
13868     # binary and the General_Category ones.
13869     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
13870                                                             property_ref('*');
13871     push @tables, $gc->tables;
13872
13873     # If the version of Unicode includes the Script property, add its tables
13874     push @tables, $script->tables if defined $script;
13875
13876     # The Block tables are kept separate because they are treated differently.
13877     # And the earliest versions of Unicode didn't include them, so add only if
13878     # there are some.
13879     my @blocks;
13880     push @blocks, $block->tables if defined $block;
13881
13882     # Here, have the lists of tables constructed.  Process blocks last so that
13883     # if there are name collisions with them, blocks have lowest priority.
13884     # Should there ever be other collisions, manual intervention would be
13885     # required.  See the comments at the beginning of the program for a
13886     # possible way to handle those semi-automatically.
13887     foreach my $table (@tables,  @blocks) {
13888
13889         # For non-binary properties, the synonym is just the name of the
13890         # table, like Greek, but for binary properties the synonym is the name
13891         # of the property, and means the code points in its 'Y' table.
13892         my $nominal = $table;
13893         my $nominal_property = $nominal->property;
13894         my $actual;
13895         if (! $nominal->isa('Property')) {
13896             $actual = $table;
13897         }
13898         else {
13899
13900             # Here is a binary property.  Use the 'Y' table.  Verify that is
13901             # there
13902             my $yes = $nominal->table('Y');
13903             unless (defined $yes) {  # Must be defined, but is permissible to
13904                                      # be empty.
13905                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
13906                 next;
13907             }
13908             $actual = $yes;
13909         }
13910
13911         foreach my $alias ($nominal->aliases) {
13912
13913             # Attempt to create a table in the perl directory for the
13914             # candidate table, using whatever aliases in it that don't
13915             # conflict.  Also add non-conflicting aliases for all these
13916             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
13917             PREFIX:
13918             foreach my $prefix ("", 'Is_', 'In_') {
13919
13920                 # Only Block properties can have added 'In_' aliases.
13921                 next if $prefix eq 'In_' and $nominal_property != $block;
13922
13923                 my $proposed_name = $prefix . $alias->name;
13924
13925                 # No Is_Is, In_In, nor combinations thereof
13926                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
13927                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
13928
13929                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
13930
13931                 # Get a reference to any existing table in the perl
13932                 # directory with the desired name.
13933                 my $pre_existing = $perl->table($proposed_name);
13934
13935                 if (! defined $pre_existing) {
13936
13937                     # No name collision, so ok to add the perl synonym.
13938
13939                     my $make_re_pod_entry;
13940                     my $ok_as_filename;
13941                     my $status = $alias->status;
13942                     if ($nominal_property == $block) {
13943
13944                         # For block properties, the 'In' form is preferred for
13945                         # external use; the pod file contains wild cards for
13946                         # this and the 'Is' form so no entries for those; and
13947                         # we don't want people using the name without the
13948                         # 'In', so discourage that.
13949                         if ($prefix eq "") {
13950                             $make_re_pod_entry = 1;
13951                             $status = $status || $DISCOURAGED;
13952                             $ok_as_filename = 0;
13953                         }
13954                         elsif ($prefix eq 'In_') {
13955                             $make_re_pod_entry = 0;
13956                             $status = $status || $NORMAL;
13957                             $ok_as_filename = 1;
13958                         }
13959                         else {
13960                             $make_re_pod_entry = 0;
13961                             $status = $status || $DISCOURAGED;
13962                             $ok_as_filename = 0;
13963                         }
13964                     }
13965                     elsif ($prefix ne "") {
13966
13967                         # The 'Is' prefix is handled in the pod by a wild
13968                         # card, and we won't use it for an external name
13969                         $make_re_pod_entry = 0;
13970                         $status = $status || $NORMAL;
13971                         $ok_as_filename = 0;
13972                     }
13973                     else {
13974
13975                         # Here, is an empty prefix, non block.  This gets its
13976                         # own pod entry and can be used for an external name.
13977                         $make_re_pod_entry = 1;
13978                         $status = $status || $NORMAL;
13979                         $ok_as_filename = 1;
13980                     }
13981
13982                     # Here, there isn't a perl pre-existing table with the
13983                     # name.  Look through the list of equivalents of this
13984                     # table to see if one is a perl table.
13985                     foreach my $equivalent ($actual->leader->equivalents) {
13986                         next if $equivalent->property != $perl;
13987
13988                         # Here, have found a table for $perl.  Add this alias
13989                         # to it, and are done with this prefix.
13990                         $equivalent->add_alias($proposed_name,
13991                                         Re_Pod_Entry => $make_re_pod_entry,
13992
13993                                         # Currently don't output these in the
13994                                         # ucd pod, as are strongly discouraged
13995                                         # from being used
13996                                         UCD => 0,
13997
13998                                         Status => $status,
13999                                         OK_as_Filename => $ok_as_filename);
14000                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14001                         next PREFIX;
14002                     }
14003
14004                     # Here, $perl doesn't already have a table that is a
14005                     # synonym for this property, add one.
14006                     my $added_table = $perl->add_match_table($proposed_name,
14007                                             Re_Pod_Entry => $make_re_pod_entry,
14008
14009                                             # See UCD comment just above
14010                                             UCD => 0,
14011
14012                                             Status => $status,
14013                                             OK_as_Filename => $ok_as_filename);
14014                     # And it will be related to the actual table, since it is
14015                     # based on it.
14016                     $added_table->set_equivalent_to($actual, Related => 1);
14017                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14018                     next;
14019                 } # End of no pre-existing.
14020
14021                 # Here, there is a pre-existing table that has the proposed
14022                 # name.  We could be in trouble, but not if this is just a
14023                 # synonym for another table that we have already made a child
14024                 # of the pre-existing one.
14025                 if ($pre_existing->is_set_equivalent_to($actual)) {
14026                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14027                     $pre_existing->add_alias($proposed_name);
14028                     next;
14029                 }
14030
14031                 # Here, there is a name collision, but it still could be ok if
14032                 # the tables match the identical set of code points, in which
14033                 # case, we can combine the names.  Compare each table's code
14034                 # point list to see if they are identical.
14035                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14036                 if ($pre_existing->matches_identically_to($actual)) {
14037
14038                     # Here, they do match identically.  Not a real conflict.
14039                     # Make the perl version a child of the Unicode one, except
14040                     # in the non-obvious case of where the perl name is
14041                     # already a synonym of another Unicode property.  (This is
14042                     # excluded by the test for it being its own parent.)  The
14043                     # reason for this exclusion is that then the two Unicode
14044                     # properties become related; and we don't really know if
14045                     # they are or not.  We generate documentation based on
14046                     # relatedness, and this would be misleading.  Code
14047                     # later executed in the process will cause the tables to
14048                     # be represented by a single file anyway, without making
14049                     # it look in the pod like they are necessarily related.
14050                     if ($pre_existing->parent == $pre_existing
14051                         && ($pre_existing->property == $perl
14052                             || $actual->property == $perl))
14053                     {
14054                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14055                         $pre_existing->set_equivalent_to($actual, Related => 1);
14056                     }
14057                     elsif (main::DEBUG && $to_trace) {
14058                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14059                         trace $pre_existing->parent;
14060                     }
14061                     next PREFIX;
14062                 }
14063
14064                 # Here they didn't match identically, there is a real conflict
14065                 # between our new name and a pre-existing property.
14066                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14067                 $pre_existing->add_conflicting($nominal->full_name,
14068                                                'p',
14069                                                $actual);
14070
14071                 # Don't output a warning for aliases for the block
14072                 # properties (unless they start with 'In_') as it is
14073                 # expected that there will be conflicts and the block
14074                 # form loses.
14075                 if ($verbosity >= $NORMAL_VERBOSITY
14076                     && ($actual->property != $block || $prefix eq 'In_'))
14077                 {
14078                     print simple_fold(join_lines(<<END
14079 There is already an alias named $proposed_name (from $pre_existing),
14080 so not creating this alias for $actual
14081 END
14082                     ), "", 4);
14083                 }
14084
14085                 # Keep track for documentation purposes.
14086                 $has_In_conflicts++ if $prefix eq 'In_';
14087                 $has_Is_conflicts++ if $prefix eq 'Is_';
14088             }
14089         }
14090     }
14091
14092     # There are some properties which have No and Yes (and N and Y) as
14093     # property values, but aren't binary, and could possibly be confused with
14094     # binary ones.  So create caveats for them.  There are tables that are
14095     # named 'No', and tables that are named 'N', but confusion is not likely
14096     # unless they are the same table.  For example, N meaning Number or
14097     # Neutral is not likely to cause confusion, so don't add caveats to things
14098     # like them.
14099     foreach my $property (grep { $_->type != $BINARY
14100                                  && $_->type != $FORCED_BINARY }
14101                                                             property_ref('*'))
14102     {
14103         my $yes = $property->table('Yes');
14104         if (defined $yes) {
14105             my $y = $property->table('Y');
14106             if (defined $y && $yes == $y) {
14107                 foreach my $alias ($property->aliases) {
14108                     $yes->add_conflicting($alias->name);
14109                 }
14110             }
14111         }
14112         my $no = $property->table('No');
14113         if (defined $no) {
14114             my $n = $property->table('N');
14115             if (defined $n && $no == $n) {
14116                 foreach my $alias ($property->aliases) {
14117                     $no->add_conflicting($alias->name, 'P');
14118                 }
14119             }
14120         }
14121     }
14122
14123     return;
14124 }
14125
14126 sub register_file_for_name($$$) {
14127     # Given info about a table and a datafile that it should be associated
14128     # with, register that association
14129
14130     my $table = shift;
14131     my $directory_ref = shift;   # Array of the directory path for the file
14132     my $file = shift;            # The file name in the final directory.
14133     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14134
14135     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14136
14137     if ($table->isa('Property')) {
14138         $table->set_file_path(@$directory_ref, $file);
14139         push @map_properties, $table;
14140
14141         # No swash means don't do the rest of this.
14142         return if $table->fate != $ORDINARY;
14143
14144         # Get the path to the file
14145         my @path = $table->file_path;
14146
14147         # Use just the file name if no subdirectory.
14148         shift @path if $path[0] eq File::Spec->curdir();
14149
14150         my $file = join '/', @path;
14151
14152         # Create a hash entry for utf8_heavy to get the file that stores this
14153         # property's map table
14154         foreach my $alias ($table->aliases) {
14155             my $name = $alias->name;
14156             $loose_property_to_file_of{standardize($name)} = $file;
14157         }
14158
14159         # And a way for utf8_heavy to find the proper key in the SwashInfo
14160         # hash for this property.
14161         $file_to_swash_name{$file} = "To" . $table->swash_name;
14162         return;
14163     }
14164
14165     # Do all of the work for all equivalent tables when called with the leader
14166     # table, so skip if isn't the leader.
14167     return if $table->leader != $table;
14168
14169     # If this is a complement of another file, use that other file instead,
14170     # with a ! prepended to it.
14171     my $complement;
14172     if (($complement = $table->complement) != 0) {
14173         my @directories = $complement->file_path;
14174
14175         # This assumes that the 0th element is something like 'lib',
14176         # the 1th element the property name (in its own directory), like
14177         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14178         # appended to it later.
14179         $directories[1] =~ s/^/!/;
14180         $file = pop @directories;
14181         $directory_ref =\@directories;
14182     }
14183
14184     # Join all the file path components together, using slashes.
14185     my $full_filename = join('/', @$directory_ref, $file);
14186
14187     # All go in the same subdirectory of unicore
14188     if ($directory_ref->[0] ne $matches_directory) {
14189         Carp::my_carp("Unexpected directory in "
14190                 .  join('/', @{$directory_ref}, $file));
14191     }
14192
14193     # For this table and all its equivalents ...
14194     foreach my $table ($table, $table->equivalents) {
14195
14196         # Associate it with its file internally.  Don't include the
14197         # $matches_directory first component
14198         $table->set_file_path(@$directory_ref, $file);
14199
14200         # No swash means don't do the rest of this.
14201         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14202
14203         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14204
14205         my $property = $table->property;
14206         my $property_name = ($property == $perl)
14207                              ? ""  # 'perl' is never explicitly stated
14208                              : standardize($property->name) . '=';
14209
14210         my $is_default = 0; # Is this table the default one for the property?
14211
14212         # To calculate $is_default, we find if this table is the same as the
14213         # default one for the property.  But this is complicated by the
14214         # possibility that there is a master table for this one, and the
14215         # information is stored there instead of here.
14216         my $parent = $table->parent;
14217         my $leader_prop = $parent->property;
14218         my $default_map = $leader_prop->default_map;
14219         if (defined $default_map) {
14220             my $default_table = $leader_prop->table($default_map);
14221             $is_default = 1 if defined $default_table && $parent == $default_table;
14222         }
14223
14224         # Calculate the loose name for this table.  Mostly it's just its name,
14225         # standardized.  But in the case of Perl tables that are single-form
14226         # equivalents to Unicode properties, it is the latter's name.
14227         my $loose_table_name =
14228                         ($property != $perl || $leader_prop == $perl)
14229                         ? standardize($table->name)
14230                         : standardize($parent->name);
14231
14232         my $deprecated = ($table->status eq $DEPRECATED)
14233                          ? $table->status_info
14234                          : "";
14235         my $caseless_equivalent = $table->caseless_equivalent;
14236
14237         # And for each of the table's aliases...  This inner loop eventually
14238         # goes through all aliases in the UCD that we generate regex match
14239         # files for
14240         foreach my $alias ($table->aliases) {
14241             my $standard = utf8_heavy_name($table, $alias);
14242
14243             # Generate an entry in either the loose or strict hashes, which
14244             # will translate the property and alias names combination into the
14245             # file where the table for them is stored.
14246             if ($alias->loose_match) {
14247                 if (exists $loose_to_file_of{$standard}) {
14248                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14249                 }
14250                 else {
14251                     $loose_to_file_of{$standard} = $sub_filename;
14252                 }
14253             }
14254             else {
14255                 if (exists $stricter_to_file_of{$standard}) {
14256                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14257                 }
14258                 else {
14259                     $stricter_to_file_of{$standard} = $sub_filename;
14260
14261                     # Tightly coupled with how utf8_heavy.pl works, for a
14262                     # floating point number that is a whole number, get rid of
14263                     # the trailing decimal point and 0's, so that utf8_heavy
14264                     # will work.  Also note that this assumes that such a
14265                     # number is matched strictly; so if that were to change,
14266                     # this would be wrong.
14267                     if ((my $integer_name = $alias->name)
14268                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14269                     {
14270                         $stricter_to_file_of{$property_name . $integer_name}
14271                                                             = $sub_filename;
14272                     }
14273                 }
14274             }
14275
14276             # For Unicode::UCD, create a mapping of the prop=value to the
14277             # canonical =value for that property.
14278             if ($standard =~ /=/) {
14279
14280                 # This could happen if a strict name mapped into an existing
14281                 # loose name.  In that event, the strict names would have to
14282                 # be moved to a new hash.
14283                 if (exists($loose_to_standard_value{$standard})) {
14284                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
14285                 }
14286                 $loose_to_standard_value{$standard} = $loose_table_name;
14287             }
14288
14289             # Keep a list of the deprecated properties and their filenames
14290             if ($deprecated && $complement == 0) {
14291                 $utf8::why_deprecated{$sub_filename} = $deprecated;
14292             }
14293
14294             # And a substitute table, if any, for case-insensitive matching
14295             if ($caseless_equivalent != 0) {
14296                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14297             }
14298
14299             # Add to defaults list if the table this alias belongs to is the
14300             # default one
14301             $loose_defaults{$standard} = 1 if $is_default;
14302         }
14303     }
14304
14305     return;
14306 }
14307
14308 {   # Closure
14309     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
14310                      # conflicts
14311     my %full_dir_name_of;   # Full length names of directories used.
14312
14313     sub construct_filename($$$) {
14314         # Return a file name for a table, based on the table name, but perhaps
14315         # changed to get rid of non-portable characters in it, and to make
14316         # sure that it is unique on a file system that allows the names before
14317         # any period to be at most 8 characters (DOS).  While we're at it
14318         # check and complain if there are any directory conflicts.
14319
14320         my $name = shift;       # The name to start with
14321         my $mutable = shift;    # Boolean: can it be changed?  If no, but
14322                                 # yet it must be to work properly, a warning
14323                                 # is given
14324         my $directories_ref = shift;  # A reference to an array containing the
14325                                 # path to the file, with each element one path
14326                                 # component.  This is used because the same
14327                                 # name can be used in different directories.
14328         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14329
14330         my $warn = ! defined wantarray;  # If true, then if the name is
14331                                 # changed, a warning is issued as well.
14332
14333         if (! defined $name) {
14334             Carp::my_carp("Undefined name in directory "
14335                           . File::Spec->join(@$directories_ref)
14336                           . ". '_' used");
14337             return '_';
14338         }
14339
14340         # Make sure that no directory names conflict with each other.  Look at
14341         # each directory in the input file's path.  If it is already in use,
14342         # assume it is correct, and is merely being re-used, but if we
14343         # truncate it to 8 characters, and find that there are two directories
14344         # that are the same for the first 8 characters, but differ after that,
14345         # then that is a problem.
14346         foreach my $directory (@$directories_ref) {
14347             my $short_dir = substr($directory, 0, 8);
14348             if (defined $full_dir_name_of{$short_dir}) {
14349                 next if $full_dir_name_of{$short_dir} eq $directory;
14350                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
14351             }
14352             else {
14353                 $full_dir_name_of{$short_dir} = $directory;
14354             }
14355         }
14356
14357         my $path = join '/', @$directories_ref;
14358         $path .= '/' if $path;
14359
14360         # Remove interior underscores.
14361         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14362
14363         # Change any non-word character into an underscore, and truncate to 8.
14364         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
14365         substr($filename, 8) = "" if length($filename) > 8;
14366
14367         # Make sure the basename doesn't conflict with something we
14368         # might have already written. If we have, say,
14369         #     InGreekExtended1
14370         #     InGreekExtended2
14371         # they become
14372         #     InGreekE
14373         #     InGreek2
14374         my $warned = 0;
14375         while (my $num = $base_names{$path}{lc $filename}++) {
14376             $num++; # so basenames with numbers start with '2', which
14377                     # just looks more natural.
14378
14379             # Want to append $num, but if it'll make the basename longer
14380             # than 8 characters, pre-truncate $filename so that the result
14381             # is acceptable.
14382             my $delta = length($filename) + length($num) - 8;
14383             if ($delta > 0) {
14384                 substr($filename, -$delta) = $num;
14385             }
14386             else {
14387                 $filename .= $num;
14388             }
14389             if ($warn && ! $warned) {
14390                 $warned = 1;
14391                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
14392             }
14393         }
14394
14395         return $filename if $mutable;
14396
14397         # If not changeable, must return the input name, but warn if needed to
14398         # change it beyond shortening it.
14399         if ($name ne $filename
14400             && substr($name, 0, length($filename)) ne $filename) {
14401             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
14402         }
14403         return $name;
14404     }
14405 }
14406
14407 # The pod file contains a very large table.  Many of the lines in that table
14408 # would exceed a typical output window's size, and so need to be wrapped with
14409 # a hanging indent to make them look good.  The pod language is really
14410 # insufficient here.  There is no general construct to do that in pod, so it
14411 # is done here by beginning each such line with a space to cause the result to
14412 # be output without formatting, and doing all the formatting here.  This leads
14413 # to the result that if the eventual display window is too narrow it won't
14414 # look good, and if the window is too wide, no advantage is taken of that
14415 # extra width.  A further complication is that the output may be indented by
14416 # the formatter so that there is less space than expected.  What I (khw) have
14417 # done is to assume that that indent is a particular number of spaces based on
14418 # what it is in my Linux system;  people can always resize their windows if
14419 # necessary, but this is obviously less than desirable, but the best that can
14420 # be expected.
14421 my $automatic_pod_indent = 8;
14422
14423 # Try to format so that uses fewest lines, but few long left column entries
14424 # slide into the right column.  An experiment on 5.1 data yielded the
14425 # following percentages that didn't cut into the other side along with the
14426 # associated first-column widths
14427 # 69% = 24
14428 # 80% not too bad except for a few blocks
14429 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
14430 # 95% = 37;
14431 my $indent_info_column = 27;    # 75% of lines didn't have overlap
14432
14433 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
14434                     # The 3 is because of:
14435                     #   1   for the leading space to tell the pod formatter to
14436                     #       output as-is
14437                     #   1   for the flag
14438                     #   1   for the space between the flag and the main data
14439
14440 sub format_pod_line ($$$;$$) {
14441     # Take a pod line and return it, formatted properly
14442
14443     my $first_column_width = shift;
14444     my $entry = shift;  # Contents of left column
14445     my $info = shift;   # Contents of right column
14446
14447     my $status = shift || "";   # Any flag
14448
14449     my $loose_match = shift;    # Boolean.
14450     $loose_match = 1 unless defined $loose_match;
14451
14452     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14453
14454     my $flags = "";
14455     $flags .= $STRICTER if ! $loose_match;
14456
14457     $flags .= $status if $status;
14458
14459     # There is a blank in the left column to cause the pod formatter to
14460     # output the line as-is.
14461     return sprintf " %-*s%-*s %s\n",
14462                     # The first * in the format is replaced by this, the -1 is
14463                     # to account for the leading blank.  There isn't a
14464                     # hard-coded blank after this to separate the flags from
14465                     # the rest of the line, so that in the unlikely event that
14466                     # multiple flags are shown on the same line, they both
14467                     # will get displayed at the expense of that separation,
14468                     # but since they are left justified, a blank will be
14469                     # inserted in the normal case.
14470                     $FILLER - 1,
14471                     $flags,
14472
14473                     # The other * in the format is replaced by this number to
14474                     # cause the first main column to right fill with blanks.
14475                     # The -1 is for the guaranteed blank following it.
14476                     $first_column_width - $FILLER - 1,
14477                     $entry,
14478                     $info;
14479 }
14480
14481 my @zero_match_tables;  # List of tables that have no matches in this release
14482
14483 sub make_re_pod_entries($) {
14484     # This generates the entries for the pod file for a given table.
14485     # Also done at this time are any children tables.  The output looks like:
14486     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
14487
14488     my $input_table = shift;        # Table the entry is for
14489     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14490
14491     # Generate parent and all its children at the same time.
14492     return if $input_table->parent != $input_table;
14493
14494     my $property = $input_table->property;
14495     my $type = $property->type;
14496     my $full_name = $property->full_name;
14497
14498     my $count = $input_table->count;
14499     my $string_count = clarify_number($count);
14500     my $status = $input_table->status;
14501     my $status_info = $input_table->status_info;
14502     my $caseless_equivalent = $input_table->caseless_equivalent;
14503
14504     # Don't mention a placeholder equivalent as it isn't to be listed in the
14505     # pod
14506     $caseless_equivalent = 0 if $caseless_equivalent != 0
14507                                 && $caseless_equivalent->fate > $ORDINARY;
14508
14509     my $entry_for_first_table; # The entry for the first table output.
14510                            # Almost certainly, it is the parent.
14511
14512     # For each related table (including itself), we will generate a pod entry
14513     # for each name each table goes by
14514     foreach my $table ($input_table, $input_table->children) {
14515
14516         # utf8_heavy.pl cannot deal with null string property values, so skip
14517         # any tables that have no non-null names.
14518         next if ! grep { $_->name ne "" } $table->aliases;
14519
14520         # First, gather all the info that applies to this table as a whole.
14521
14522         push @zero_match_tables, $table if $count == 0
14523                                             # Don't mention special tables
14524                                             # as being zero length
14525                                            && $table->fate == $ORDINARY;
14526
14527         my $table_property = $table->property;
14528
14529         # The short name has all the underscores removed, while the full name
14530         # retains them.  Later, we decide whether to output a short synonym
14531         # for the full one, we need to compare apples to apples, so we use the
14532         # short name's length including underscores.
14533         my $table_property_short_name_length;
14534         my $table_property_short_name
14535             = $table_property->short_name(\$table_property_short_name_length);
14536         my $table_property_full_name = $table_property->full_name;
14537
14538         # Get how much savings there is in the short name over the full one
14539         # (delta will always be <= 0)
14540         my $table_property_short_delta = $table_property_short_name_length
14541                                          - length($table_property_full_name);
14542         my @table_description = $table->description;
14543         my @table_note = $table->note;
14544
14545         # Generate an entry for each alias in this table.
14546         my $entry_for_first_alias;  # saves the first one encountered.
14547         foreach my $alias ($table->aliases) {
14548
14549             # Skip if not to go in pod.
14550             next unless $alias->make_re_pod_entry;
14551
14552             # Start gathering all the components for the entry
14553             my $name = $alias->name;
14554
14555             # Skip if name is empty, as can't be accessed by regexes.
14556             next if $name eq "";
14557
14558             my $entry;      # Holds the left column, may include extras
14559             my $entry_ref;  # To refer to the left column's contents from
14560                             # another entry; has no extras
14561
14562             # First the left column of the pod entry.  Tables for the $perl
14563             # property always use the single form.
14564             if ($table_property == $perl) {
14565                 $entry = "\\p{$name}";
14566                 $entry_ref = "\\p{$name}";
14567             }
14568             else {    # Compound form.
14569
14570                 # Only generate one entry for all the aliases that mean true
14571                 # or false in binary properties.  Append a '*' to indicate
14572                 # some are missing.  (The heading comment notes this.)
14573                 my $rhs;
14574                 if ($type == $BINARY) {
14575                     next if $name ne 'N' && $name ne 'Y';
14576                     $rhs = "$name*";
14577                 }
14578                 elsif ($type != $FORCED_BINARY) {
14579                     $rhs = $name;
14580                 }
14581                 else {
14582
14583                     # Forced binary properties require special handling.  It
14584                     # has two sets of tables, one set is true/false; and the
14585                     # other set is everything else.  Entries are generated for
14586                     # each set.  Use the Bidi_Mirrored property (which appears
14587                     # in all Unicode versions) to get a list of the aliases
14588                     # for the true/false tables.  Of these, only output the N
14589                     # and Y ones, the same as, a regular binary property.  And
14590                     # output all the rest, same as a non-binary property.
14591                     my $bm = property_ref("Bidi_Mirrored");
14592                     if ($name eq 'N' || $name eq 'Y') {
14593                         $rhs = "$name*";
14594                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
14595                                                         $bm->table("N")->aliases)
14596                     {
14597                         next;
14598                     }
14599                     else {
14600                         $rhs = $name;
14601                     }
14602                 }
14603
14604                 # Colon-space is used to give a little more space to be easier
14605                 # to read;
14606                 $entry = "\\p{"
14607                         . $table_property_full_name
14608                         . ": $rhs}";
14609
14610                 # But for the reference to this entry, which will go in the
14611                 # right column, where space is at a premium, use equals
14612                 # without a space
14613                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
14614             }
14615
14616             # Then the right (info) column.  This is stored as components of
14617             # an array for the moment, then joined into a string later.  For
14618             # non-internal only properties, begin the info with the entry for
14619             # the first table we encountered (if any), as things are ordered
14620             # so that that one is the most descriptive.  This leads to the
14621             # info column of an entry being a more descriptive version of the
14622             # name column
14623             my @info;
14624             if ($name =~ /^_/) {
14625                 push @info,
14626                         '(For internal use by Perl, not necessarily stable)';
14627             }
14628             elsif ($entry_for_first_alias) {
14629                 push @info, $entry_for_first_alias;
14630             }
14631
14632             # If this entry is equivalent to another, add that to the info,
14633             # using the first such table we encountered
14634             if ($entry_for_first_table) {
14635                 if (@info) {
14636                     push @info, "(= $entry_for_first_table)";
14637                 }
14638                 else {
14639                     push @info, $entry_for_first_table;
14640                 }
14641             }
14642
14643             # If the name is a large integer, add an equivalent with an
14644             # exponent for better readability
14645             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
14646                 push @info, sprintf "(= %.1e)", $name
14647             }
14648
14649             my $parenthesized = "";
14650             if (! $entry_for_first_alias) {
14651
14652                 # This is the first alias for the current table.  The alias
14653                 # array is ordered so that this is the fullest, most
14654                 # descriptive alias, so it gets the fullest info.  The other
14655                 # aliases are mostly merely pointers to this one, using the
14656                 # information already added above.
14657
14658                 # Display any status message, but only on the parent table
14659                 if ($status && ! $entry_for_first_table) {
14660                     push @info, $status_info;
14661                 }
14662
14663                 # Put out any descriptive info
14664                 if (@table_description || @table_note) {
14665                     push @info, join "; ", @table_description, @table_note;
14666                 }
14667
14668                 # Look to see if there is a shorter name we can point people
14669                 # at
14670                 my $standard_name = standardize($name);
14671                 my $short_name;
14672                 my $proposed_short = $table->short_name;
14673                 if (defined $proposed_short) {
14674                     my $standard_short = standardize($proposed_short);
14675
14676                     # If the short name is shorter than the standard one, or
14677                     # even it it's not, but the combination of it and its
14678                     # short property name (as in \p{prop=short} ($perl doesn't
14679                     # have this form)) saves at least two characters, then,
14680                     # cause it to be listed as a shorter synonym.
14681                     if (length $standard_short < length $standard_name
14682                         || ($table_property != $perl
14683                             && (length($standard_short)
14684                                 - length($standard_name)
14685                                 + $table_property_short_delta)  # (<= 0)
14686                                 < -2))
14687                     {
14688                         $short_name = $proposed_short;
14689                         if ($table_property != $perl) {
14690                             $short_name = $table_property_short_name
14691                                           . "=$short_name";
14692                         }
14693                         $short_name = "\\p{$short_name}";
14694                     }
14695                 }
14696
14697                 # And if this is a compound form name, see if there is a
14698                 # single form equivalent
14699                 my $single_form;
14700                 if ($table_property != $perl) {
14701
14702                     # Special case the binary N tables, so that will print
14703                     # \P{single}, but use the Y table values to populate
14704                     # 'single', as we haven't likewise populated the N table.
14705                     # For forced binary tables, we can't just look at the N
14706                     # table, but must see if this table is equivalent to the N
14707                     # one, as there are two equivalent beasts in these
14708                     # properties.
14709                     my $test_table;
14710                     my $p;
14711                     if (   ($type == $BINARY
14712                             && $input_table == $property->table('No'))
14713                         || ($type == $FORCED_BINARY
14714                             && $property->table('No')->
14715                                         is_set_equivalent_to($input_table)))
14716                     {
14717                         $test_table = $property->table('Yes');
14718                         $p = 'P';
14719                     }
14720                     else {
14721                         $test_table = $input_table;
14722                         $p = 'p';
14723                     }
14724
14725                     # Look for a single form amongst all the children.
14726                     foreach my $table ($test_table->children) {
14727                         next if $table->property != $perl;
14728                         my $proposed_name = $table->short_name;
14729                         next if ! defined $proposed_name;
14730
14731                         # Don't mention internal-only properties as a possible
14732                         # single form synonym
14733                         next if substr($proposed_name, 0, 1) eq '_';
14734
14735                         $proposed_name = "\\$p\{$proposed_name}";
14736                         if (! defined $single_form
14737                             || length($proposed_name) < length $single_form)
14738                         {
14739                             $single_form = $proposed_name;
14740
14741                             # The goal here is to find a single form; not the
14742                             # shortest possible one.  We've already found a
14743                             # short name.  So, stop at the first single form
14744                             # found, which is likely to be closer to the
14745                             # original.
14746                             last;
14747                         }
14748                     }
14749                 }
14750
14751                 # Ouput both short and single in the same parenthesized
14752                 # expression, but with only one of 'Single', 'Short' if there
14753                 # are both items.
14754                 if ($short_name || $single_form || $table->conflicting) {
14755                     $parenthesized .= "Short: $short_name" if $short_name;
14756                     if ($short_name && $single_form) {
14757                         $parenthesized .= ', ';
14758                     }
14759                     elsif ($single_form) {
14760                         $parenthesized .= 'Single: ';
14761                     }
14762                     $parenthesized .= $single_form if $single_form;
14763                 }
14764             }
14765
14766             if ($caseless_equivalent != 0) {
14767                 $parenthesized .=  '; ' if $parenthesized ne "";
14768                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
14769             }
14770
14771
14772             # Warn if this property isn't the same as one that a
14773             # semi-casual user might expect.  The other components of this
14774             # parenthesized structure are calculated only for the first entry
14775             # for this table, but the conflicting is deemed important enough
14776             # to go on every entry.
14777             my $conflicting = join " NOR ", $table->conflicting;
14778             if ($conflicting) {
14779                 $parenthesized .=  '; ' if $parenthesized ne "";
14780                 $parenthesized .= "NOT $conflicting";
14781             }
14782
14783             push @info, "($parenthesized)" if $parenthesized;
14784
14785             if ($name =~ /_$/ && $alias->loose_match) {
14786                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
14787             }
14788
14789             if ($table_property != $perl && $table->perl_extension) {
14790                 push @info, '(Perl extension)';
14791             }
14792             push @info, "($string_count)";
14793
14794             # Now, we have both the entry and info so add them to the
14795             # list of all the properties.
14796             push @match_properties,
14797                 format_pod_line($indent_info_column,
14798                                 $entry,
14799                                 join( " ", @info),
14800                                 $alias->status,
14801                                 $alias->loose_match);
14802
14803             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
14804         } # End of looping through the aliases for this table.
14805
14806         if (! $entry_for_first_table) {
14807             $entry_for_first_table = $entry_for_first_alias;
14808         }
14809     } # End of looping through all the related tables
14810     return;
14811 }
14812
14813 sub make_ucd_table_pod_entries {
14814     my $table = shift;
14815
14816     # Generate the entries for the UCD section of the pod for $table.  This
14817     # also calculates if names are ambiguous, so has to be called even if the
14818     # pod is not being output
14819
14820     my $short_name = $table->name;
14821     my $standard_short_name = standardize($short_name);
14822     my $full_name = $table->full_name;
14823     my $standard_full_name = standardize($full_name);
14824
14825     my $full_info = "";     # Text of info column for full-name entries
14826     my $other_info = "";    # Text of info column for short-name entries
14827     my $short_info = "";    # Text of info column for other entries
14828     my $meaning = "";       # Synonym of this table
14829
14830     my $property = ($table->isa('Property'))
14831                    ? $table
14832                    : $table->parent->property;
14833
14834     my $perl_extension = $table->perl_extension;
14835
14836     # Get the more official name for for perl extensions that aren't
14837     # stand-alone properties
14838     if ($perl_extension && $property != $table) {
14839         if ($property == $perl ||$property->type == $BINARY) {
14840             $meaning = $table->complete_name;
14841         }
14842         else {
14843             $meaning = $property->full_name . "=$full_name";
14844         }
14845     }
14846
14847     # There are three types of info column.  One for the short name, one for
14848     # the full name, and one for everything else.  They mostly are the same,
14849     # so initialize in the same loop.
14850     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
14851         if ($perl_extension && $property != $table) {
14852
14853             # Add the synonymous name for the non-full name entries; and to
14854             # the full-name entry if it adds extra information
14855             if ($info_ref == \$other_info
14856                 || ($info_ref == \$short_info
14857                     && $standard_short_name ne $standard_full_name)
14858                 || standardize($meaning) ne $standard_full_name
14859             ) {
14860                 $$info_ref .= "$meaning.";
14861             }
14862         }
14863         elsif ($info_ref != \$full_info) {
14864
14865             # Otherwise, the non-full name columns include the full name
14866             $$info_ref .= $full_name;
14867         }
14868
14869         # And the full-name entry includes the short name, if different
14870         if ($info_ref == \$full_info
14871             && $standard_short_name ne $standard_full_name)
14872         {
14873             $full_info =~ s/\.\Z//;
14874             $full_info .= "  " if $full_info;
14875             $full_info .= "(Short: $short_name)";
14876         }
14877
14878         if ($table->perl_extension) {
14879             $$info_ref =~ s/\.\Z//;
14880             $$info_ref .= ".  " if $$info_ref;
14881             $$info_ref .= "(Perl extension)";
14882         }
14883     }
14884
14885     # Add any extra annotations to the full name entry
14886     foreach my $more_info ($table->description,
14887                             $table->note,
14888                             $table->status_info)
14889     {
14890         next unless $more_info;
14891         $full_info =~ s/\.\Z//;
14892         $full_info .= ".  " if $full_info;
14893         $full_info .= $more_info;
14894     }
14895
14896     # These keep track if have created full and short name pod entries for the
14897     # property
14898     my $done_full = 0;
14899     my $done_short = 0;
14900
14901     # Every possible name is kept track of, even those that aren't going to be
14902     # output.  This way we can be sure to find the ambiguities.
14903     foreach my $alias ($table->aliases) {
14904         my $name = $alias->name;
14905         my $standard = standardize($name);
14906         my $info;
14907         my $output_this = $alias->ucd;
14908
14909         # If the full and short names are the same, we want to output the full
14910         # one's entry, so it has priority.
14911         if ($standard eq $standard_full_name) {
14912             next if $done_full;
14913             $done_full = 1;
14914             $info = $full_info;
14915         }
14916         elsif ($standard eq $standard_short_name) {
14917             next if $done_short;
14918             $done_short = 1;
14919             next if $standard_short_name eq $standard_full_name;
14920             $info = $short_info;
14921         }
14922         else {
14923             $info = $other_info;
14924         }
14925
14926         # Here, we have set up the two columns for this entry.  But if an
14927         # entry already exists for this name, we have to decide which one
14928         # we're going to later output.
14929         if (exists $ucd_pod{$standard}) {
14930
14931             # If the two entries refer to the same property, it's not going to
14932             # be ambiguous.  (Likely it's because the names when standardized
14933             # are the same.)  But that means if they are different properties,
14934             # there is ambiguity.
14935             if ($ucd_pod{$standard}->{'property'} != $property) {
14936
14937                 # Here, we have an ambiguity.  This code assumes that one is
14938                 # scheduled to be output and one not and that one is a perl
14939                 # extension (which is not to be output) and the other isn't.
14940                 # If those assumptions are wrong, things have to be rethought.
14941                 if ($ucd_pod{$standard}{'output_this'} == $output_this
14942                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
14943                     || $output_this == $perl_extension)
14944                 {
14945                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
14946                 }
14947
14948                 # We modifiy the info column of the one being output to
14949                 # indicate the ambiguity.  Set $which to point to that one's
14950                 # info.
14951                 my $which;
14952                 if ($ucd_pod{$standard}{'output_this'}) {
14953                     $which = \$ucd_pod{$standard}->{'info'};
14954                 }
14955                 else {
14956                     $which = \$info;
14957                     $meaning = $ucd_pod{$standard}{'meaning'};
14958                 }
14959
14960                 chomp $$which;
14961                 $$which =~ s/\.\Z//;
14962                 $$which .= "; NOT '$standard' meaning '$meaning'";
14963
14964                 $ambiguous_names{$standard} = 1;
14965             }
14966
14967             # Use the non-perl-extension variant
14968             next unless $ucd_pod{$standard}{'perl_extension'};
14969         }
14970
14971         # Store enough information about this entry that we can later look for
14972         # ambiguities, and output it properly.
14973         $ucd_pod{$standard} = { 'name' => $name,
14974                                 'info' => $info,
14975                                 'meaning' => $meaning,
14976                                 'output_this' => $output_this,
14977                                 'perl_extension' => $perl_extension,
14978                                 'property' => $property,
14979                                 'status' => $alias->status,
14980         };
14981     } # End of looping through all this table's aliases
14982
14983     return;
14984 }
14985
14986 sub pod_alphanumeric_sort {
14987     # Sort pod entries alphanumerically.
14988
14989     # The first few character columns are filler, plus the '\p{'; and get rid
14990     # of all the trailing stuff, starting with the trailing '}', so as to sort
14991     # on just 'Name=Value'
14992     (my $a = lc $a) =~ s/^ .*? { //x;
14993     $a =~ s/}.*//;
14994     (my $b = lc $b) =~ s/^ .*? { //x;
14995     $b =~ s/}.*//;
14996
14997     # Determine if the two operands are both internal only or both not.
14998     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
14999     # should be the underscore that begins internal only
15000     my $a_is_internal = (substr($a, 0, 1) eq '_');
15001     my $b_is_internal = (substr($b, 0, 1) eq '_');
15002
15003     # Sort so the internals come last in the table instead of first (which the
15004     # leading underscore would otherwise indicate).
15005     if ($a_is_internal != $b_is_internal) {
15006         return 1 if $a_is_internal;
15007         return -1
15008     }
15009
15010     # Determine if the two operands are numeric property values or not.
15011     # A numeric property will look like xyz: 3.  But the number
15012     # can begin with an optional minus sign, and may have a
15013     # fraction or rational component, like xyz: 3/2.  If either
15014     # isn't numeric, use alphabetic sort.
15015     my ($a_initial, $a_number) =
15016         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15017     return $a cmp $b unless defined $a_number;
15018     my ($b_initial, $b_number) =
15019         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15020     return $a cmp $b unless defined $b_number;
15021
15022     # Here they are both numeric, but use alphabetic sort if the
15023     # initial parts don't match
15024     return $a cmp $b if $a_initial ne $b_initial;
15025
15026     # Convert rationals to floating for the comparison.
15027     $a_number = eval $a_number if $a_number =~ qr{/};
15028     $b_number = eval $b_number if $b_number =~ qr{/};
15029
15030     return $a_number <=> $b_number;
15031 }
15032
15033 sub make_pod () {
15034     # Create the .pod file.  This generates the various subsections and then
15035     # combines them in one big HERE document.
15036
15037     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15038
15039     return unless defined $pod_directory;
15040     print "Making pod file\n" if $verbosity >= $PROGRESS;
15041
15042     my $exception_message =
15043     '(Any exceptions are individually noted beginning with the word NOT.)';
15044     my @block_warning;
15045     if (-e 'Blocks.txt') {
15046
15047         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
15048         # if the global $has_In_conflicts indicates we have them.
15049         push @match_properties, format_pod_line($indent_info_column,
15050                                                 '\p{In_*}',
15051                                                 '\p{Block: *}'
15052                                                     . (($has_In_conflicts)
15053                                                       ? " $exception_message"
15054                                                       : ""));
15055         @block_warning = << "END";
15056
15057 Matches in the Block property have shortcuts that begin with "In_".  For
15058 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
15059 backward compatibility, if there is no conflict with another shortcut, these
15060 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
15061 are numerous such conflicting shortcuts.  Use of these forms for Block is
15062 discouraged, and are flagged as such, not only because of the potential
15063 confusion as to what is meant, but also because a later release of Unicode may
15064 preempt the shortcut, and your program would no longer be correct.  Use the
15065 "In_" form instead to avoid this, or even more clearly, use the compound form,
15066 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
15067 about this.
15068 END
15069     }
15070     my $text = $Is_flags_text;
15071     $text = "$exception_message $text" if $has_Is_conflicts;
15072
15073     # And the 'Is_ line';
15074     push @match_properties, format_pod_line($indent_info_column,
15075                                             '\p{Is_*}',
15076                                             "\\p{*} $text");
15077
15078     # Sort the properties array for output.  It is sorted alphabetically
15079     # except numerically for numeric properties, and only output unique lines.
15080     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15081
15082     my $formatted_properties = simple_fold(\@match_properties,
15083                                         "",
15084                                         # indent succeeding lines by two extra
15085                                         # which looks better
15086                                         $indent_info_column + 2,
15087
15088                                         # shorten the line length by how much
15089                                         # the formatter indents, so the folded
15090                                         # line will fit in the space
15091                                         # presumably available
15092                                         $automatic_pod_indent);
15093     # Add column headings, indented to be a little more centered, but not
15094     # exactly
15095     $formatted_properties =  format_pod_line($indent_info_column,
15096                                                     '    NAME',
15097                                                     '           INFO')
15098                                     . "\n"
15099                                     . $formatted_properties;
15100
15101     # Generate pod documentation lines for the tables that match nothing
15102     my $zero_matches = "";
15103     if (@zero_match_tables) {
15104         @zero_match_tables = uniques(@zero_match_tables);
15105         $zero_matches = join "\n\n",
15106                         map { $_ = '=item \p{' . $_->complete_name . "}" }
15107                             sort { $a->complete_name cmp $b->complete_name }
15108                             @zero_match_tables;
15109
15110         $zero_matches = <<END;
15111
15112 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15113
15114 Unicode has some property-value pairs that currently don't match anything.
15115 This happens generally either because they are obsolete, or they exist for
15116 symmetry with other forms, but no language has yet been encoded that uses
15117 them.  In this version of Unicode, the following match zero code points:
15118
15119 =over 4
15120
15121 $zero_matches
15122
15123 =back
15124
15125 END
15126     }
15127
15128     # Generate list of properties that we don't accept, grouped by the reasons
15129     # why.  This is so only put out the 'why' once, and then list all the
15130     # properties that have that reason under it.
15131
15132     my %why_list;   # The keys are the reasons; the values are lists of
15133                     # properties that have the key as their reason
15134
15135     # For each property, add it to the list that are suppressed for its reason
15136     # The sort will cause the alphabetically first properties to be added to
15137     # each list first, so each list will be sorted.
15138     foreach my $property (sort keys %why_suppressed) {
15139         push @{$why_list{$why_suppressed{$property}}}, $property;
15140     }
15141
15142     # For each reason (sorted by the first property that has that reason)...
15143     my @bad_re_properties;
15144     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15145                      keys %why_list)
15146     {
15147         # Add to the output, all the properties that have that reason.
15148         my $has_item = 0;   # Flag if actually output anything.
15149         foreach my $name (@{$why_list{$why}}) {
15150
15151             # Split compound names into $property and $table components
15152             my $property = $name;
15153             my $table;
15154             if ($property =~ / (.*) = (.*) /x) {
15155                 $property = $1;
15156                 $table = $2;
15157             }
15158
15159             # This release of Unicode may not have a property that is
15160             # suppressed, so don't reference a non-existent one.
15161             $property = property_ref($property);
15162             next if ! defined $property;
15163
15164             # And since this list is only for match tables, don't list the
15165             # ones that don't have match tables.
15166             next if ! $property->to_create_match_tables;
15167
15168             # Find any abbreviation, and turn it into a compound name if this
15169             # is a property=value pair.
15170             my $short_name = $property->name;
15171             $short_name .= '=' . $property->table($table)->name if $table;
15172
15173             # Start with an empty line.
15174             push @bad_re_properties, "\n\n" unless $has_item;
15175
15176             # And add the property as an item for the reason.
15177             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15178             $has_item = 1;
15179         }
15180
15181         # And add the reason under the list of properties, if such a list
15182         # actually got generated.  Note that the header got added
15183         # unconditionally before.  But pod ignores extra blank lines, so no
15184         # harm.
15185         push @bad_re_properties, "\n$why\n" if $has_item;
15186
15187     } # End of looping through each reason.
15188
15189     if (! @bad_re_properties) {
15190         push @bad_re_properties,
15191                 "*** This installation accepts ALL non-Unihan properties ***";
15192     }
15193     else {
15194         # Add =over only if non-empty to avoid an empty =over/=back section,
15195         # which is considered bad form.
15196         unshift @bad_re_properties, "\n=over 4\n";
15197         push @bad_re_properties, "\n=back\n";
15198     }
15199
15200     # Similiarly, generate a list of files that we don't use, grouped by the
15201     # reasons why.  First, create a hash whose keys are the reasons, and whose
15202     # values are anonymous arrays of all the files that share that reason.
15203     my %grouped_by_reason;
15204     foreach my $file (keys %ignored_files) {
15205         push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15206     }
15207     foreach my $file (keys %skipped_files) {
15208         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15209     }
15210
15211     # Then, sort each group.
15212     foreach my $group (keys %grouped_by_reason) {
15213         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15214                                         @{$grouped_by_reason{$group}} ;
15215     }
15216
15217     # Finally, create the output text.  For each reason (sorted by the
15218     # alphabetically first file that has that reason)...
15219     my @unused_files;
15220     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15221                                cmp lc $grouped_by_reason{$b}->[0]
15222                               }
15223                          keys %grouped_by_reason)
15224     {
15225         # Add all the files that have that reason to the output.  Start
15226         # with an empty line.
15227         push @unused_files, "\n\n";
15228         push @unused_files, map { "\n=item F<$_> \n" }
15229                             @{$grouped_by_reason{$reason}};
15230         # And add the reason under the list of files
15231         push @unused_files, "\n$reason\n";
15232     }
15233
15234     # Similarly, create the output text for the UCD section of the pod
15235     my @ucd_pod;
15236     foreach my $key (keys %ucd_pod) {
15237         next unless $ucd_pod{$key}->{'output_this'};
15238         push @ucd_pod, format_pod_line($indent_info_column,
15239                                        $ucd_pod{$key}->{'name'},
15240                                        $ucd_pod{$key}->{'info'},
15241                                        $ucd_pod{$key}->{'status'},
15242                                       );
15243     }
15244
15245     # Sort alphabetically, and fold for output
15246     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15247     my $ucd_pod = simple_fold(\@ucd_pod,
15248                            ' ',
15249                            $indent_info_column,
15250                            $automatic_pod_indent);
15251     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
15252                 . "\n"
15253                 . $ucd_pod;
15254     local $" = "";
15255
15256     # Everything is ready to assemble.
15257     my @OUT = << "END";
15258 =begin comment
15259
15260 $HEADER
15261
15262 To change this file, edit $0 instead.
15263
15264 =end comment
15265
15266 =head1 NAME
15267
15268 $pod_file - Index of Unicode Version $string_version character properties in Perl
15269
15270 =head1 DESCRIPTION
15271
15272 This document provides information about the portion of the Unicode database
15273 that deals with character properties, that is the portion that is defined on
15274 single code points.  (L</Other information in the Unicode data base>
15275 below briefly mentions other data that Unicode provides.)
15276
15277 Perl can provide access to all non-provisional Unicode character properties,
15278 though not all are enabled by default.  The omitted ones are the Unihan
15279 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15280 deprecated or Unicode-internal properties.  (An installation may choose to
15281 recompile Perl's tables to change this.  See L<Unicode character
15282 properties that are NOT accepted by Perl>.)
15283
15284 For most purposes, access to Unicode properties from the Perl core is through
15285 regular expression matches, as described in the next section.
15286 For some special purposes, and to access the properties that are not suitable
15287 for regular expression matching, all the Unicode character properties that
15288 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15289 described in the section L</Properties accessible through Unicode::UCD>.
15290
15291 Perl also provides some additional extensions and short-cut synonyms
15292 for Unicode properties.
15293
15294 This document merely lists all available properties and does not attempt to
15295 explain what each property really means.  There is a brief description of each
15296 Perl extension; see L<perlunicode/Other Properties> for more information on
15297 these.  There is some detail about Blocks, Scripts, General_Category,
15298 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15299 official Unicode properties, refer to the Unicode standard.  A good starting
15300 place is L<$unicode_reference_url>.
15301
15302 Note that you can define your own properties; see
15303 L<perlunicode/"User-Defined Character Properties">.
15304
15305 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15306
15307 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15308 most of the Unicode character properties.  The table below shows all these
15309 constructs, both single and compound forms.
15310
15311 B<Compound forms> consist of two components, separated by an equals sign or a
15312 colon.  The first component is the property name, and the second component is
15313 the particular value of the property to match against, for example,
15314 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15315 whose Script property is Greek.
15316
15317 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15318 their equivalent compound forms.  The table shows these equivalences.  (In our
15319 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
15320 There are also a few Perl-defined single forms that are not shortcuts for a
15321 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
15322
15323 In parsing these constructs, Perl always ignores Upper/lower case differences
15324 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
15325 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
15326 the left brace completely changes the meaning of the construct, from "match"
15327 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
15328 for improved legibility.
15329
15330 Also, white space, hyphens, and underscores are normally ignored
15331 everywhere between the {braces}, and hence can be freely added or removed
15332 even if the C</x> modifier hasn't been specified on the regular expression.
15333 But $a_bold_stricter at the beginning of an entry in the table below
15334 means that tighter (stricter) rules are used for that entry:
15335
15336 =over 4
15337
15338 =item Single form (C<\\p{name}>) tighter rules:
15339
15340 White space, hyphens, and underscores ARE significant
15341 except for:
15342
15343 =over 4
15344
15345 =item * white space adjacent to a non-word character
15346
15347 =item * underscores separating digits in numbers
15348
15349 =back
15350
15351 That means, for example, that you can freely add or remove white space
15352 adjacent to (but within) the braces without affecting the meaning.
15353
15354 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
15355
15356 The tighter rules given above for the single form apply to everything to the
15357 right of the colon or equals; the looser rules still apply to everything to
15358 the left.
15359
15360 That means, for example, that you can freely add or remove white space
15361 adjacent to (but within) the braces and the colon or equal sign.
15362
15363 =back
15364
15365 Some properties are considered obsolete by Unicode, but still available.
15366 There are several varieties of obsolescence:
15367
15368 =over 4
15369
15370 =item Stabilized
15371
15372 A property may be stabilized.  Such a determination does not indicate
15373 that the property should or should not be used; instead it is a declaration
15374 that the property will not be maintained nor extended for newly encoded
15375 characters.  Such properties are marked with $a_bold_stabilized in the
15376 table.
15377
15378 =item Deprecated
15379
15380 A property may be deprecated, perhaps because its original intent
15381 has been replaced by another property, or because its specification was
15382 somehow defective.  This means that its use is strongly
15383 discouraged, so much so that a warning will be issued if used, unless the
15384 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
15385 statement.  $A_bold_deprecated flags each such entry in the table, and
15386 the entry there for the longest, most descriptive version of the property will
15387 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
15388 warning, even for properties that aren't officially deprecated by Unicode,
15389 when there used to be characters or code points that were matched by them, but
15390 no longer.  This is to warn you that your program may not work like it did on
15391 earlier Unicode releases.
15392
15393 A deprecated property may be made unavailable in a future Perl version, so it
15394 is best to move away from them.
15395
15396 A deprecated property may also be stabilized, but this fact is not shown.
15397
15398 =item Obsolete
15399
15400 Properties marked with $a_bold_obsolete in the table are considered (plain)
15401 obsolete.  Generally this designation is given to properties that Unicode once
15402 used for internal purposes (but not any longer).
15403
15404 =back
15405
15406 Some Perl extensions are present for backwards compatibility and are
15407 discouraged from being used, but are not obsolete.  $A_bold_discouraged
15408 flags each such entry in the table.  Future Unicode versions may force
15409 some of these extensions to be removed without warning, replaced by another
15410 property with the same name that means something different.  Use the
15411 equivalent shown instead.
15412
15413 @block_warning
15414
15415 The table below has two columns.  The left column contains the C<\\p{}>
15416 constructs to look up, possibly preceded by the flags mentioned above; and
15417 the right column contains information about them, like a description, or
15418 synonyms.  It shows both the single and compound forms for each property that
15419 has them.  If the left column is a short name for a property, the right column
15420 will give its longer, more descriptive name; and if the left column is the
15421 longest name, the right column will show any equivalent shortest name, in both
15422 single and compound forms if applicable.
15423
15424 The right column will also caution you if a property means something different
15425 than what might normally be expected.
15426
15427 All single forms are Perl extensions; a few compound forms are as well, and
15428 are noted as such.
15429
15430 Numbers in (parentheses) indicate the total number of code points matched by
15431 the property.  For emphasis, those properties that match no code points at all
15432 are listed as well in a separate section following the table.
15433
15434 Most properties match the same code points regardless of whether C<"/i">
15435 case-insensitive matching is specified or not.  But a few properties are
15436 affected.  These are shown with the notation
15437
15438  (/i= other_property)
15439
15440 in the second column.  Under case-insensitive matching they match the
15441 same code pode points as the property "other_property".
15442
15443 There is no description given for most non-Perl defined properties (See
15444 L<$unicode_reference_url> for that).
15445
15446 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
15447 combinations.  For example, entries like:
15448
15449  \\p{Gc: *}                                  \\p{General_Category: *}
15450
15451 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
15452 for the latter is also valid for the former.  Similarly,
15453
15454  \\p{Is_*}                                   \\p{*}
15455
15456 means that if and only if, for example, C<\\p{Foo}> exists, then
15457 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
15458 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
15459 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
15460 underscore.
15461
15462 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
15463 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
15464 'N*' to indicate this, and doesn't have separate entries for the other
15465 possibilities.  Note that not all properties which have values 'Yes' and 'No'
15466 are binary, and they have all their values spelled out without using this wild
15467 card, and a C<NOT> clause in their description that highlights their not being
15468 binary.  These also require the compound form to match them, whereas true
15469 binary properties have both single and compound forms available.
15470
15471 Note that all non-essential underscores are removed in the display of the
15472 short names below.
15473
15474 B<Legend summary:>
15475
15476 =over 4
15477
15478 =item Z<>B<*> is a wild-card
15479
15480 =item B<(\\d+)> in the info column gives the number of code points matched by
15481 this property.
15482
15483 =item B<$DEPRECATED> means this is deprecated.
15484
15485 =item B<$OBSOLETE> means this is obsolete.
15486
15487 =item B<$STABILIZED> means this is stabilized.
15488
15489 =item B<$STRICTER> means tighter (stricter) name matching applies.
15490
15491 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
15492 stable.
15493
15494 =back
15495
15496 $formatted_properties
15497
15498 $zero_matches
15499
15500 =head1 Properties accessible through Unicode::UCD
15501
15502 All the Unicode character properties mentioned above (except for those marked
15503 as for internal use by Perl) are also accessible by
15504 L<Unicode::UCD/prop_invlist()>.
15505
15506 Due to their nature, not all Unicode character properties are suitable for
15507 regular expression matches, nor C<prop_invlist()>.  The remaining
15508 non-provisional, non-internal ones are accessible via
15509 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
15510 hasn't included; see L<below for which those are|/Unicode character properties
15511 that are NOT accepted by Perl>).
15512
15513 For compatibility with other parts of Perl, all the single forms given in the
15514 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
15515 are recognized.  BUT, there are some ambiguities between some Perl extensions
15516 and the Unicode properties, all of which are silently resolved in favor of the
15517 official Unicode property.  To avoid surprises, you should only use
15518 C<prop_invmap()> for forms listed in the table below, which omits the
15519 non-recommended ones.  The affected forms are the Perl single form equivalents
15520 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
15521 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
15522 whose short name is C<sc>.  The table indicates the current ambiguities in the
15523 INFO column, beginning with the word C<"NOT">.
15524
15525 The standard Unicode properties listed below are documented in
15526 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
15527 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
15528 L<perlunicode/Other Properties>;
15529
15530 The first column in the table is a name for the property; the second column is
15531 an alternative name, if any, plus possibly some annotations.  The alternative
15532 name is the property's full name, unless that would simply repeat the first
15533 column, in which case the second column indicates the property's short name
15534 (if different).  The annotations are given only in the entry for the full
15535 name.  If a property is obsolete, etc, the entry will be flagged with the same
15536 characters used in the table in the L<section above|/Properties accessible
15537 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
15538
15539 $ucd_pod
15540
15541 =head1 Properties accessible through other means
15542
15543 Certain properties are accessible also via core function calls.  These are:
15544
15545  Lowercase_Mapping          lc() and lcfirst()
15546  Titlecase_Mapping          ucfirst()
15547  Uppercase_Mapping          uc()
15548
15549 Also, Case_Folding is accessible through the C</i> modifier in regular
15550 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
15551 operator.
15552
15553 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
15554 interpolation in double-quoted strings and regular expressions; and functions
15555 C<charnames::viacode()>, C<charnames::vianame()>, and
15556 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
15557 specified.
15558
15559 Finally, most properties related to decomposition are accessible via
15560 L<Unicode::Normalize>.
15561
15562 =head1 Unicode character properties that are NOT accepted by Perl
15563
15564 Perl will generate an error for a few character properties in Unicode when
15565 used in a regular expression.  The non-Unihan ones are listed below, with the
15566 reasons they are not accepted, perhaps with work-arounds.  The short names for
15567 the properties are listed enclosed in (parentheses).
15568 As described after the list, an installation can change the defaults and choose
15569 to accept any of these.  The list is machine generated based on the
15570 choices made for the installation that generated this document.
15571
15572 @bad_re_properties
15573
15574 An installation can choose to allow any of these to be matched by downloading
15575 the Unicode database from L<http://www.unicode.org/Public/> to
15576 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
15577 controlling lists contained in the program
15578 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
15579 (C<\%Config> is available from the Config module).
15580
15581 =head1 Other information in the Unicode data base
15582
15583 The Unicode data base is delivered in two different formats.  The XML version
15584 is valid for more modern Unicode releases.  The other version is a collection
15585 of files.  The two are intended to give equivalent information.  Perl uses the
15586 older form; this allows you to recompile Perl to use early Unicode releases.
15587
15588 The only non-character property that Perl currently supports is Named
15589 Sequences, in which a sequence of code points
15590 is given a name and generally treated as a single entity.  (Perl supports
15591 these via the C<\\N{...}> double-quotish construct,
15592 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
15593
15594 Below is a list of the files in the Unicode data base that Perl doesn't
15595 currently use, along with very brief descriptions of their purposes.
15596 Some of the names of the files have been shortened from those that Unicode
15597 uses, in order to allow them to be distinguishable from similarly named files
15598 on file systems for which only the first 8 characters of a name are
15599 significant.
15600
15601 =over 4
15602
15603 @unused_files
15604
15605 =back
15606
15607 =head1 SEE ALSO
15608
15609 L<$unicode_reference_url>
15610
15611 L<perlrecharclass>
15612
15613 L<perlunicode>
15614
15615 END
15616
15617     # And write it.  The 0 means no utf8.
15618     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
15619     return;
15620 }
15621
15622 sub make_Heavy () {
15623     # Create and write Heavy.pl, which passes info about the tables to
15624     # utf8_heavy.pl
15625
15626     # Stringify structures for output
15627     my $loose_property_name_of
15628                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
15629     chomp $loose_property_name_of;
15630
15631     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
15632     chomp $stricter_to_file_of;
15633
15634     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
15635     chomp $loose_to_file_of;
15636
15637     my $nv_floating_to_rational
15638                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
15639     chomp $nv_floating_to_rational;
15640
15641     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
15642     chomp $why_deprecated;
15643
15644     # We set the key to the file when we associated files with tables, but we
15645     # couldn't do the same for the value then, as we might not have the file
15646     # for the alternate table figured out at that time.
15647     foreach my $cased (keys %caseless_equivalent_to) {
15648         my @path = $caseless_equivalent_to{$cased}->file_path;
15649         my $path = join '/', @path[1, -1];
15650         $caseless_equivalent_to{$cased} = $path;
15651     }
15652     my $caseless_equivalent_to
15653                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
15654     chomp $caseless_equivalent_to;
15655
15656     my $loose_property_to_file_of
15657                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
15658     chomp $loose_property_to_file_of;
15659
15660     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
15661     chomp $file_to_swash_name;
15662
15663     my @heavy = <<END;
15664 $HEADER
15665 $INTERNAL_ONLY_HEADER
15666
15667 # This file is for the use of utf8_heavy.pl and Unicode::UCD
15668
15669 # Maps Unicode (not Perl single-form extensions) property names in loose
15670 # standard form to their corresponding standard names
15671 \%utf8::loose_property_name_of = (
15672 $loose_property_name_of
15673 );
15674
15675 # Maps property, table to file for those using stricter matching
15676 \%utf8::stricter_to_file_of = (
15677 $stricter_to_file_of
15678 );
15679
15680 # Maps property, table to file for those using loose matching
15681 \%utf8::loose_to_file_of = (
15682 $loose_to_file_of
15683 );
15684
15685 # Maps floating point to fractional form
15686 \%utf8::nv_floating_to_rational = (
15687 $nv_floating_to_rational
15688 );
15689
15690 # If a floating point number doesn't have enough digits in it to get this
15691 # close to a fraction, it isn't considered to be that fraction even if all the
15692 # digits it does have match.
15693 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
15694
15695 # Deprecated tables to generate a warning for.  The key is the file containing
15696 # the table, so as to avoid duplication, as many property names can map to the
15697 # file, but we only need one entry for all of them.
15698 \%utf8::why_deprecated = (
15699 $why_deprecated
15700 );
15701
15702 # A few properties have different behavior under /i matching.  This maps
15703 # those to substitute files to use under /i.
15704 \%utf8::caseless_equivalent = (
15705 $caseless_equivalent_to
15706 );
15707
15708 # Property names to mapping files
15709 \%utf8::loose_property_to_file_of = (
15710 $loose_property_to_file_of
15711 );
15712
15713 # Files to the swash names within them.
15714 \%utf8::file_to_swash_name = (
15715 $file_to_swash_name
15716 );
15717
15718 1;
15719 END
15720
15721     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
15722     return;
15723 }
15724
15725 sub make_Name_pm () {
15726     # Create and write Name.pm, which contains subroutines and data to use in
15727     # conjunction with Name.pl
15728
15729     # Maybe there's nothing to do.
15730     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
15731
15732     my @name = <<END;
15733 $HEADER
15734 $INTERNAL_ONLY_HEADER
15735 END
15736
15737     # Convert these structures to output format.
15738     my $code_points_ending_in_code_point =
15739         main::simple_dumper(\@code_points_ending_in_code_point,
15740                             ' ' x 8);
15741     my $names = main::simple_dumper(\%names_ending_in_code_point,
15742                                     ' ' x 8);
15743     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
15744                                     ' ' x 8);
15745
15746     # Do the same with the Hangul names,
15747     my $jamo;
15748     my $jamo_l;
15749     my $jamo_v;
15750     my $jamo_t;
15751     my $jamo_re;
15752     if ($has_hangul_syllables) {
15753
15754         # Construct a regular expression of all the possible
15755         # combinations of the Hangul syllables.
15756         my @L_re;   # Leading consonants
15757         for my $i ($LBase .. $LBase + $LCount - 1) {
15758             push @L_re, $Jamo{$i}
15759         }
15760         my @V_re;   # Middle vowels
15761         for my $i ($VBase .. $VBase + $VCount - 1) {
15762             push @V_re, $Jamo{$i}
15763         }
15764         my @T_re;   # Trailing consonants
15765         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
15766             push @T_re, $Jamo{$i}
15767         }
15768
15769         # The whole re is made up of the L V T combination.
15770         $jamo_re = '('
15771                     . join ('|', sort @L_re)
15772                     . ')('
15773                     . join ('|', sort @V_re)
15774                     . ')('
15775                     . join ('|', sort @T_re)
15776                     . ')?';
15777
15778         # These hashes needed by the algorithm were generated
15779         # during reading of the Jamo.txt file
15780         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
15781         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
15782         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
15783         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
15784     }
15785
15786     push @name, <<END;
15787
15788 package charnames;
15789
15790 # This module contains machine-generated tables and code for the
15791 # algorithmically-determinable Unicode character names.  The following
15792 # routines can be used to translate between name and code point and vice versa
15793
15794 { # Closure
15795
15796     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
15797     # two must be 10; if there are 5, the first must not be a 0.  Written this
15798     # way to decrease backtracking.  The first regex allows the code point to
15799     # be at the end of a word, but to work properly, the word shouldn't end
15800     # with a valid hex character.  The second one won't match a code point at
15801     # the end of a word, and doesn't have the run-on issue
15802     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
15803     my \$code_point_re = qr/$code_point_re/;
15804
15805     # In the following hash, the keys are the bases of names which include
15806     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
15807     # of each key is another hash which is used to get the low and high ends
15808     # for each range of code points that apply to the name.
15809     my %names_ending_in_code_point = (
15810 $names
15811     );
15812
15813     # The following hash is a copy of the previous one, except is for loose
15814     # matching, so each name has blanks and dashes squeezed out
15815     my %loose_names_ending_in_code_point = (
15816 $loose_names
15817     );
15818
15819     # And the following array gives the inverse mapping from code points to
15820     # names.  Lowest code points are first
15821     my \@code_points_ending_in_code_point = (
15822 $code_points_ending_in_code_point
15823     );
15824 END
15825     # Earlier releases didn't have Jamos.  No sense outputting
15826     # them unless will be used.
15827     if ($has_hangul_syllables) {
15828         push @name, <<END;
15829
15830     # Convert from code point to Jamo short name for use in composing Hangul
15831     # syllable names
15832     my %Jamo = (
15833 $jamo
15834     );
15835
15836     # Leading consonant (can be null)
15837     my %Jamo_L = (
15838 $jamo_l
15839     );
15840
15841     # Vowel
15842     my %Jamo_V = (
15843 $jamo_v
15844     );
15845
15846     # Optional trailing consonant
15847     my %Jamo_T = (
15848 $jamo_t
15849     );
15850
15851     # Computed re that splits up a Hangul name into LVT or LV syllables
15852     my \$syllable_re = qr/$jamo_re/;
15853
15854     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
15855     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
15856
15857     # These constants names and values were taken from the Unicode standard,
15858     # version 5.1, section 3.12.  They are used in conjunction with Hangul
15859     # syllables
15860     my \$SBase = $SBase_string;
15861     my \$LBase = $LBase_string;
15862     my \$VBase = $VBase_string;
15863     my \$TBase = $TBase_string;
15864     my \$SCount = $SCount;
15865     my \$LCount = $LCount;
15866     my \$VCount = $VCount;
15867     my \$TCount = $TCount;
15868     my \$NCount = \$VCount * \$TCount;
15869 END
15870     } # End of has Jamos
15871
15872     push @name, << 'END';
15873
15874     sub name_to_code_point_special {
15875         my ($name, $loose) = @_;
15876
15877         # Returns undef if not one of the specially handled names; otherwise
15878         # returns the code point equivalent to the input name
15879         # $loose is non-zero if to use loose matching, 'name' in that case
15880         # must be input as upper case with all blanks and dashes squeezed out.
15881 END
15882     if ($has_hangul_syllables) {
15883         push @name, << 'END';
15884
15885         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
15886             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
15887         {
15888             return if $name !~ qr/^$syllable_re$/;
15889             my $L = $Jamo_L{$1};
15890             my $V = $Jamo_V{$2};
15891             my $T = (defined $3) ? $Jamo_T{$3} : 0;
15892             return ($L * $VCount + $V) * $TCount + $T + $SBase;
15893         }
15894 END
15895     }
15896     push @name, << 'END';
15897
15898         # Name must end in 'code_point' for this to handle.
15899         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
15900                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
15901
15902         my $base = $1;
15903         my $code_point = CORE::hex $2;
15904         my $names_ref;
15905
15906         if ($loose) {
15907             $names_ref = \%loose_names_ending_in_code_point;
15908         }
15909         else {
15910             return if $base !~ s/-$//;
15911             $names_ref = \%names_ending_in_code_point;
15912         }
15913
15914         # Name must be one of the ones which has the code point in it.
15915         return if ! $names_ref->{$base};
15916
15917         # Look through the list of ranges that apply to this name to see if
15918         # the code point is in one of them.
15919         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
15920             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
15921             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
15922
15923             # Here, the code point is in the range.
15924             return $code_point;
15925         }
15926
15927         # Here, looked like the name had a code point number in it, but
15928         # did not match one of the valid ones.
15929         return;
15930     }
15931
15932     sub code_point_to_name_special {
15933         my $code_point = shift;
15934
15935         # Returns the name of a code point if algorithmically determinable;
15936         # undef if not
15937 END
15938     if ($has_hangul_syllables) {
15939         push @name, << 'END';
15940
15941         # If in the Hangul range, calculate the name based on Unicode's
15942         # algorithm
15943         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
15944             use integer;
15945             my $SIndex = $code_point - $SBase;
15946             my $L = $LBase + $SIndex / $NCount;
15947             my $V = $VBase + ($SIndex % $NCount) / $TCount;
15948             my $T = $TBase + $SIndex % $TCount;
15949             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
15950             $name .= $Jamo{$T} if $T != $TBase;
15951             return $name;
15952         }
15953 END
15954     }
15955     push @name, << 'END';
15956
15957         # Look through list of these code points for one in range.
15958         foreach my $hash (@code_points_ending_in_code_point) {
15959             return if $code_point < $hash->{'low'};
15960             if ($code_point <= $hash->{'high'}) {
15961                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
15962             }
15963         }
15964         return;            # None found
15965     }
15966 } # End closure
15967
15968 1;
15969 END
15970
15971     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
15972     return;
15973 }
15974
15975 sub make_UCD () {
15976     # Create and write UCD.pl, which passes info about the tables to
15977     # Unicode::UCD
15978
15979     # Create a mapping from each alias of Perl single-form extensions to all
15980     # its equivalent aliases, for quick look-up.
15981     my %perlprop_to_aliases;
15982     foreach my $table ($perl->tables) {
15983
15984         # First create the list of the aliases of each extension
15985         my @aliases_list;    # List of legal aliases for this extension
15986
15987         my $table_name = $table->name;
15988         my $standard_table_name = standardize($table_name);
15989         my $table_full_name = $table->full_name;
15990         my $standard_table_full_name = standardize($table_full_name);
15991
15992         # Make sure that the list has both the short and full names
15993         push @aliases_list, $table_name, $table_full_name;
15994
15995         my $found_ucd = 0;  # ? Did we actually get an alias that should be
15996                             # output for this table
15997
15998         # Go through all the aliases (including the two just added), and add
15999         # any new unique ones to the list
16000         foreach my $alias ($table->aliases) {
16001
16002             # Skip non-legal names
16003             next unless $alias->ok_as_filename;
16004             next unless $alias->ucd;
16005
16006             $found_ucd = 1;     # have at least one legal name
16007
16008             my $name = $alias->name;
16009             my $standard = standardize($name);
16010
16011             # Don't repeat a name that is equivalent to one already on the
16012             # list
16013             next if $standard eq $standard_table_name;
16014             next if $standard eq $standard_table_full_name;
16015
16016             push @aliases_list, $name;
16017         }
16018
16019         # If there were no legal names, don't output anything.
16020         next unless $found_ucd;
16021
16022         # To conserve memory in the program reading these in, omit full names
16023         # that are identical to the short name, when those are the only two
16024         # aliases for the property.
16025         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16026             pop @aliases_list;
16027         }
16028
16029         # Here, @aliases_list is the list of all the aliases that this
16030         # extension legally has.  Now can create a map to it from each legal
16031         # standardized alias
16032         foreach my $alias ($table->aliases) {
16033             next unless $alias->ucd;
16034             next unless $alias->ok_as_filename;
16035             push @{$perlprop_to_aliases{standardize($alias->name)}},
16036                  @aliases_list;
16037         }
16038     }
16039
16040     # Make a list of all combinations of properties/values that are suppressed.
16041     my @suppressed;
16042     if (! $debug_skip) {    # This tends to fail in this debug mode
16043         foreach my $property_name (keys %why_suppressed) {
16044
16045             # Just the value
16046             my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16047
16048             # The hash may contain properties not in this release of Unicode
16049             next unless defined (my $property = property_ref($property_name));
16050
16051             # Find all combinations
16052             foreach my $prop_alias ($property->aliases) {
16053                 my $prop_alias_name = standardize($prop_alias->name);
16054
16055                 # If no =value, there's just one combination possibe for this
16056                 if (! $value_name) {
16057
16058                     # The property may be suppressed, but there may be a proxy
16059                     # for it, so it shouldn't be listed as suppressed
16060                     next if $prop_alias->ucd;
16061                     push @suppressed, $prop_alias_name;
16062                 }
16063                 else {  # Otherwise
16064                     foreach my $value_alias
16065                                     ($property->table($value_name)->aliases)
16066                     {
16067                         next if $value_alias->ucd;
16068
16069                         push @suppressed, "$prop_alias_name="
16070                                         .  standardize($value_alias->name);
16071                     }
16072                 }
16073             }
16074         }
16075     }
16076
16077     # Convert the structure below (designed for Name.pm) to a form that UCD
16078     # wants, so it doesn't have to modify it at all; i.e. so that it includes
16079     # an element for the Hangul syllables in the appropriate place, and
16080     # otherwise changes the name to include the "-<code point>" suffix.
16081     my @algorithm_names;
16082     my $done_hangul = 0;
16083
16084     # Copy it linearly.
16085     for my $i (0 .. @code_points_ending_in_code_point - 1) {
16086
16087         # Insert the hanguls in the correct place.
16088         if (! $done_hangul
16089             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16090         {
16091             $done_hangul = 1;
16092             push @algorithm_names, { low => $SBase,
16093                                      high => $SBase + $SCount - 1,
16094                                      name => '<hangul syllable>',
16095                                     };
16096         }
16097
16098         # Copy the current entry, modified.
16099         push @algorithm_names, {
16100             low => $code_points_ending_in_code_point[$i]->{'low'},
16101             high => $code_points_ending_in_code_point[$i]->{'high'},
16102             name =>
16103                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16104         };
16105     }
16106
16107     # Serialize these structures for output.
16108     my $loose_to_standard_value
16109                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16110     chomp $loose_to_standard_value;
16111
16112     my $string_property_loose_to_name
16113                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16114     chomp $string_property_loose_to_name;
16115
16116     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16117     chomp $perlprop_to_aliases;
16118
16119     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16120     chomp $prop_aliases;
16121
16122     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16123     chomp $prop_value_aliases;
16124
16125     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16126     chomp $suppressed;
16127
16128     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16129     chomp $algorithm_names;
16130
16131     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16132     chomp $ambiguous_names;
16133
16134     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16135     chomp $loose_defaults;
16136
16137     my @ucd = <<END;
16138 $HEADER
16139 $INTERNAL_ONLY_HEADER
16140
16141 # This file is for the use of Unicode::UCD
16142
16143 # Highest legal Unicode code point
16144 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16145
16146 # Hangul syllables
16147 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16148 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16149
16150 # Keys are all the possible "prop=value" combinations, in loose form; values
16151 # are the standard loose name for the 'value' part of the key
16152 \%Unicode::UCD::loose_to_standard_value = (
16153 $loose_to_standard_value
16154 );
16155
16156 # String property loose names to standard loose name
16157 \%Unicode::UCD::string_property_loose_to_name = (
16158 $string_property_loose_to_name
16159 );
16160
16161 # Keys are Perl extensions in loose form; values are each one's list of
16162 # aliases
16163 \%Unicode::UCD::loose_perlprop_to_name = (
16164 $perlprop_to_aliases
16165 );
16166
16167 # Keys are standard property name; values are each one's aliases
16168 \%Unicode::UCD::prop_aliases = (
16169 $prop_aliases
16170 );
16171
16172 # Keys of top level are standard property name; values are keys to another
16173 # hash,  Each one is one of the property's values, in standard form.  The
16174 # values are that prop-val's aliases.  If only one specified, the short and
16175 # long alias are identical.
16176 \%Unicode::UCD::prop_value_aliases = (
16177 $prop_value_aliases
16178 );
16179
16180 # Ordered (by code point ordinal) list of the ranges of code points whose
16181 # names are algorithmically determined.  Each range entry is an anonymous hash
16182 # of the start and end points and a template for the names within it.
16183 \@Unicode::UCD::algorithmic_named_code_points = (
16184 $algorithm_names
16185 );
16186
16187 # The properties that as-is have two meanings, and which must be disambiguated
16188 \%Unicode::UCD::ambiguous_names = (
16189 $ambiguous_names
16190 );
16191
16192 # Keys are the prop-val combinations which are the default values for the
16193 # given property, expressed in standard loose form
16194 \%Unicode::UCD::loose_defaults = (
16195 $loose_defaults
16196 );
16197
16198 # All combinations of names that are suppressed.
16199 # This is actually for UCD.t, so it knows which properties shouldn't have
16200 # entries.  If it got any bigger, would probably want to put it in its own
16201 # file to use memory only when it was needed, in testing.
16202 \@Unicode::UCD::suppressed_properties = (
16203 $suppressed
16204 );
16205
16206 1;
16207 END
16208
16209     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
16210     return;
16211 }
16212
16213 sub write_all_tables() {
16214     # Write out all the tables generated by this program to files, as well as
16215     # the supporting data structures, pod file, and .t file.
16216
16217     my @writables;              # List of tables that actually get written
16218     my %match_tables_to_write;  # Used to collapse identical match tables
16219                                 # into one file.  Each key is a hash function
16220                                 # result to partition tables into buckets.
16221                                 # Each value is an array of the tables that
16222                                 # fit in the bucket.
16223
16224     # For each property ...
16225     # (sort so that if there is an immutable file name, it has precedence, so
16226     # some other property can't come in and take over its file name.  If b's
16227     # file name is defined, will return 1, meaning to take it first; don't
16228     # care if both defined, as they had better be different anyway.  And the
16229     # property named 'Perl' needs to be first (it doesn't have any immutable
16230     # file name) because empty properties are defined in terms of it's table
16231     # named 'Any'.)
16232     PROPERTY:
16233     foreach my $property (sort { return -1 if $a == $perl;
16234                                  return 1 if $b == $perl;
16235                                  return defined $b->file
16236                                 } property_ref('*'))
16237     {
16238         my $type = $property->type;
16239
16240         # And for each table for that property, starting with the mapping
16241         # table for it ...
16242         TABLE:
16243         foreach my $table($property,
16244
16245                         # and all the match tables for it (if any), sorted so
16246                         # the ones with the shortest associated file name come
16247                         # first.  The length sorting prevents problems of a
16248                         # longer file taking a name that might have to be used
16249                         # by a shorter one.  The alphabetic sorting prevents
16250                         # differences between releases
16251                         sort {  my $ext_a = $a->external_name;
16252                                 return 1 if ! defined $ext_a;
16253                                 my $ext_b = $b->external_name;
16254                                 return -1 if ! defined $ext_b;
16255
16256                                 # But return the non-complement table before
16257                                 # the complement one, as the latter is defined
16258                                 # in terms of the former, and needs to have
16259                                 # the information for the former available.
16260                                 return 1 if $a->complement != 0;
16261                                 return -1 if $b->complement != 0;
16262
16263                                 # Similarly, return a subservient table after
16264                                 # a leader
16265                                 return 1 if $a->leader != $a;
16266                                 return -1 if $b->leader != $b;
16267
16268                                 my $cmp = length $ext_a <=> length $ext_b;
16269
16270                                 # Return result if lengths not equal
16271                                 return $cmp if $cmp;
16272
16273                                 # Alphabetic if lengths equal
16274                                 return $ext_a cmp $ext_b
16275                         } $property->tables
16276                     )
16277         {
16278
16279             # Here we have a table associated with a property.  It could be
16280             # the map table (done first for each property), or one of the
16281             # other tables.  Determine which type.
16282             my $is_property = $table->isa('Property');
16283
16284             my $name = $table->name;
16285             my $complete_name = $table->complete_name;
16286
16287             # See if should suppress the table if is empty, but warn if it
16288             # contains something.
16289             my $suppress_if_empty_warn_if_not
16290                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
16291
16292             # Calculate if this table should have any code points associated
16293             # with it or not.
16294             my $expected_empty =
16295
16296                 # $perl should be empty, as well as properties that we just
16297                 # don't do anything with
16298                 ($is_property
16299                     && ($table == $perl
16300                         || grep { $complete_name eq $_ }
16301                                                     @unimplemented_properties
16302                     )
16303                 )
16304
16305                 # Match tables in properties we skipped populating should be
16306                 # empty
16307                 || (! $is_property && ! $property->to_create_match_tables)
16308
16309                 # Tables and properties that are expected to have no code
16310                 # points should be empty
16311                 || $suppress_if_empty_warn_if_not
16312             ;
16313
16314             # Set a boolean if this table is the complement of an empty binary
16315             # table
16316             my $is_complement_of_empty_binary =
16317                 $type == $BINARY &&
16318                 (($table == $property->table('Y')
16319                     && $property->table('N')->is_empty)
16320                 || ($table == $property->table('N')
16321                     && $property->table('Y')->is_empty));
16322
16323             if ($table->is_empty) {
16324
16325                 if ($suppress_if_empty_warn_if_not) {
16326                     $table->set_fate($SUPPRESSED,
16327                                      $suppress_if_empty_warn_if_not);
16328                 }
16329
16330                 # Suppress (by skipping them) expected empty tables.
16331                 next TABLE if $expected_empty;
16332
16333                 # And setup to later output a warning for those that aren't
16334                 # known to be allowed to be empty.  Don't do the warning if
16335                 # this table is a child of another one to avoid duplicating
16336                 # the warning that should come from the parent one.
16337                 if (($table == $property || $table->parent == $table)
16338                     && $table->fate != $SUPPRESSED
16339                     && $table->fate != $MAP_PROXIED
16340                     && ! grep { $complete_name =~ /^$_$/ }
16341                                                     @tables_that_may_be_empty)
16342                 {
16343                     push @unhandled_properties, "$table";
16344                 }
16345
16346                 # An empty table is just the complement of everything.
16347                 $table->set_complement($Any) if $table != $property;
16348             }
16349             elsif ($expected_empty) {
16350                 my $because = "";
16351                 if ($suppress_if_empty_warn_if_not) {
16352                     $because = " because $suppress_if_empty_warn_if_not";
16353                 }
16354
16355                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
16356             }
16357
16358             # Some tables should match everything
16359             my $expected_full =
16360                 ($table->fate == $SUPPRESSED)
16361                 ? 0
16362                 : ($is_property)
16363                   ? # All these types of map tables will be full because
16364                     # they will have been populated with defaults
16365                     ($type == $ENUM || $type == $FORCED_BINARY)
16366
16367                   : # A match table should match everything if its method
16368                     # shows it should
16369                     ($table->matches_all
16370
16371                     # The complement of an empty binary table will match
16372                     # everything
16373                     || $is_complement_of_empty_binary
16374                     )
16375             ;
16376
16377             my $count = $table->count;
16378             if ($expected_full) {
16379                 if ($count != $MAX_UNICODE_CODEPOINTS) {
16380                     Carp::my_carp("$table matches only "
16381                     . clarify_number($count)
16382                     . " Unicode code points but should match "
16383                     . clarify_number($MAX_UNICODE_CODEPOINTS)
16384                     . " (off by "
16385                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
16386                     . ").  Proceeding anyway.");
16387                 }
16388
16389                 # Here is expected to be full.  If it is because it is the
16390                 # complement of an (empty) binary table that is to be
16391                 # suppressed, then suppress this one as well.
16392                 if ($is_complement_of_empty_binary) {
16393                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
16394                     my $opposing = $property->table($opposing_name);
16395                     my $opposing_status = $opposing->status;
16396                     if ($opposing_status) {
16397                         $table->set_status($opposing_status,
16398                                            $opposing->status_info);
16399                     }
16400                 }
16401             }
16402             elsif ($count == $MAX_UNICODE_CODEPOINTS
16403                    && ($table == $property || $table->leader == $table)
16404                    && $table->property->status ne $PLACEHOLDER)
16405             {
16406                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
16407             }
16408
16409             if ($table->fate >= $SUPPRESSED) {
16410                 if (! $is_property) {
16411                     my @children = $table->children;
16412                     foreach my $child (@children) {
16413                         if ($child->fate < $SUPPRESSED) {
16414                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
16415                         }
16416                     }
16417                 }
16418                 next TABLE;
16419
16420             }
16421
16422             if (! $is_property) {
16423
16424                 make_ucd_table_pod_entries($table) if $table->property == $perl;
16425
16426                 # Several things need to be done just once for each related
16427                 # group of match tables.  Do them on the parent.
16428                 if ($table->parent == $table) {
16429
16430                     # Add an entry in the pod file for the table; it also does
16431                     # the children.
16432                     make_re_pod_entries($table) if defined $pod_directory;
16433
16434                     # See if the the table matches identical code points with
16435                     # something that has already been output.  In that case,
16436                     # no need to have two files with the same code points in
16437                     # them.  We use the table's hash() method to store these
16438                     # in buckets, so that it is quite likely that if two
16439                     # tables are in the same bucket they will be identical, so
16440                     # don't have to compare tables frequently.  The tables
16441                     # have to have the same status to share a file, so add
16442                     # this to the bucket hash.  (The reason for this latter is
16443                     # that Heavy.pl associates a status with a file.)
16444                     # We don't check tables that are inverses of others, as it
16445                     # would lead to some coding complications, and checking
16446                     # all the regular ones should find everything.
16447                     if ($table->complement == 0) {
16448                         my $hash = $table->hash . ';' . $table->status;
16449
16450                         # Look at each table that is in the same bucket as
16451                         # this one would be.
16452                         foreach my $comparison
16453                                             (@{$match_tables_to_write{$hash}})
16454                         {
16455                             if ($table->matches_identically_to($comparison)) {
16456                                 $table->set_equivalent_to($comparison,
16457                                                                 Related => 0);
16458                                 next TABLE;
16459                             }
16460                         }
16461
16462                         # Here, not equivalent, add this table to the bucket.
16463                         push @{$match_tables_to_write{$hash}}, $table;
16464                     }
16465                 }
16466             }
16467             else {
16468
16469                 # Here is the property itself.
16470                 # Don't write out or make references to the $perl property
16471                 next if $table == $perl;
16472
16473                 make_ucd_table_pod_entries($table);
16474
16475                 # There is a mapping stored of the various synonyms to the
16476                 # standardized name of the property for utf8_heavy.pl.
16477                 # Also, the pod file contains entries of the form:
16478                 # \p{alias: *}         \p{full: *}
16479                 # rather than show every possible combination of things.
16480
16481                 my @property_aliases = $property->aliases;
16482
16483                 my $full_property_name = $property->full_name;
16484                 my $property_name = $property->name;
16485                 my $standard_property_name = standardize($property_name);
16486                 my $standard_property_full_name
16487                                         = standardize($full_property_name);
16488
16489                 # We also create for Unicode::UCD a list of aliases for
16490                 # the property.  The list starts with the property name;
16491                 # then its full name.
16492                 my @property_list;
16493                 my @standard_list;
16494                 if ( $property->fate <= $MAP_PROXIED) {
16495                     @property_list = ($property_name, $full_property_name);
16496                     @standard_list = ($standard_property_name,
16497                                         $standard_property_full_name);
16498                 }
16499
16500                 # For each synonym ...
16501                 for my $i (0 .. @property_aliases - 1)  {
16502                     my $alias = $property_aliases[$i];
16503                     my $alias_name = $alias->name;
16504                     my $alias_standard = standardize($alias_name);
16505
16506
16507                     # Add other aliases to the list of property aliases
16508                     if ($property->fate <= $MAP_PROXIED
16509                         && ! grep { $alias_standard eq $_ } @standard_list)
16510                     {
16511                         push @property_list, $alias_name;
16512                         push @standard_list, $alias_standard;
16513                     }
16514
16515                     # For utf8_heavy, set the mapping of the alias to the
16516                     # property
16517                     if ($type == $STRING) {
16518                         if ($property->fate <= $MAP_PROXIED) {
16519                             $string_property_loose_to_name{$alias_standard}
16520                                             = $standard_property_name;
16521                         }
16522                     }
16523                     else {
16524                         if (exists ($loose_property_name_of{$alias_standard}))
16525                         {
16526                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
16527                         }
16528                         else {
16529                             $loose_property_name_of{$alias_standard}
16530                                                 = $standard_property_name;
16531                         }
16532
16533                         # Now for the re pod entry for this alias.  Skip if not
16534                         # outputting a pod; skip the first one, which is the
16535                         # full name so won't have an entry like: '\p{full: *}
16536                         # \p{full: *}', and skip if don't want an entry for
16537                         # this one.
16538                         next if $i == 0
16539                                 || ! defined $pod_directory
16540                                 || ! $alias->make_re_pod_entry;
16541
16542                         my $rhs = "\\p{$full_property_name: *}";
16543                         if ($property != $perl && $table->perl_extension) {
16544                             $rhs .= ' (Perl extension)';
16545                         }
16546                         push @match_properties,
16547                             format_pod_line($indent_info_column,
16548                                         '\p{' . $alias->name . ': *}',
16549                                         $rhs,
16550                                         $alias->status);
16551                     }
16552                 }
16553
16554                 # The list of all possible names is attached to each alias, so
16555                 # lookup is easy
16556                 if (@property_list) {
16557                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
16558                 }
16559
16560                 if ($property->fate <= $MAP_PROXIED) {
16561
16562                     # Similarly, we create for Unicode::UCD a list of
16563                     # property-value aliases.
16564
16565                     my $property_full_name = $property->full_name;
16566
16567                     # Look at each table in the property...
16568                     foreach my $table ($property->tables) {
16569                         my @values_list;
16570                         my $table_full_name = $table->full_name;
16571                         my $standard_table_full_name
16572                                               = standardize($table_full_name);
16573                         my $table_name = $table->name;
16574                         my $standard_table_name = standardize($table_name);
16575
16576                         # The list starts with the table name and its full
16577                         # name.
16578                         push @values_list, $table_name, $table_full_name;
16579
16580                         # We add to the table each unique alias that isn't
16581                         # discouraged from use.
16582                         foreach my $alias ($table->aliases) {
16583                             next if $alias->status
16584                                  && $alias->status eq $DISCOURAGED;
16585                             my $name = $alias->name;
16586                             my $standard = standardize($name);
16587                             next if $standard eq $standard_table_name;
16588                             next if $standard eq $standard_table_full_name;
16589                             push @values_list, $name;
16590                         }
16591
16592                         # Here @values_list is a list of all the aliases for
16593                         # the table.  That is, all the property-values given
16594                         # by this table.  By agreement with Unicode::UCD,
16595                         # if the name and full name are identical, and there
16596                         # are no other names, drop the duplcate entry to save
16597                         # memory.
16598                         if (@values_list == 2
16599                             && $values_list[0] eq $values_list[1])
16600                         {
16601                             pop @values_list
16602                         }
16603
16604                         # To save memory, unlike the similar list for property
16605                         # aliases above, only the standard forms hve the list.
16606                         # This forces an extra step of converting from input
16607                         # name to standard name, but the savings are
16608                         # considerable.  (There is only marginal savings if we
16609                         # did this with the property aliases.)
16610                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
16611                     }
16612                 }
16613
16614                 # Don't write out a mapping file if not desired.
16615                 next if ! $property->to_output_map;
16616             }
16617
16618             # Here, we know we want to write out the table, but don't do it
16619             # yet because there may be other tables that come along and will
16620             # want to share the file, and the file's comments will change to
16621             # mention them.  So save for later.
16622             push @writables, $table;
16623
16624         } # End of looping through the property and all its tables.
16625     } # End of looping through all properties.
16626
16627     # Now have all the tables that will have files written for them.  Do it.
16628     foreach my $table (@writables) {
16629         my @directory;
16630         my $filename;
16631         my $property = $table->property;
16632         my $is_property = ($table == $property);
16633         if (! $is_property) {
16634
16635             # Match tables for the property go in lib/$subdirectory, which is
16636             # the property's name.  Don't use the standard file name for this,
16637             # as may get an unfamiliar alias
16638             @directory = ($matches_directory, $property->external_name);
16639         }
16640         else {
16641
16642             @directory = $table->directory;
16643             $filename = $table->file;
16644         }
16645
16646         # Use specified filename if available, or default to property's
16647         # shortest name.  We need an 8.3 safe filename (which means "an 8
16648         # safe" filename, since after the dot is only 'pl', which is < 3)
16649         # The 2nd parameter is if the filename shouldn't be changed, and
16650         # it shouldn't iff there is a hard-coded name for this table.
16651         $filename = construct_filename(
16652                                 $filename || $table->external_name,
16653                                 ! $filename,    # mutable if no filename
16654                                 \@directory);
16655
16656         register_file_for_name($table, \@directory, $filename);
16657
16658         # Only need to write one file when shared by more than one
16659         # property
16660         next if ! $is_property
16661                 && ($table->leader != $table || $table->complement != 0);
16662
16663         # Construct a nice comment to add to the file
16664         $table->set_final_comment;
16665
16666         $table->write;
16667     }
16668
16669
16670     # Write out the pod file
16671     make_pod;
16672
16673     # And Heavy.pl, Name.pm, UCD.pl
16674     make_Heavy;
16675     make_Name_pm;
16676     make_UCD;
16677
16678     make_property_test_script() if $make_test_script;
16679     make_normalization_test_script() if $make_norm_test_script;
16680     return;
16681 }
16682
16683 my @white_space_separators = ( # This used only for making the test script.
16684                             "",
16685                             ' ',
16686                             "\t",
16687                             '   '
16688                         );
16689
16690 sub generate_separator($) {
16691     # This used only for making the test script.  It generates the colon or
16692     # equal separator between the property and property value, with random
16693     # white space surrounding the separator
16694
16695     my $lhs = shift;
16696
16697     return "" if $lhs eq "";  # No separator if there's only one (the r) side
16698
16699     # Choose space before and after randomly
16700     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
16701     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
16702
16703     # And return the whole complex, half the time using a colon, half the
16704     # equals
16705     return $spaces_before
16706             . (rand() < 0.5) ? '=' : ':'
16707             . $spaces_after;
16708 }
16709
16710 sub generate_tests($$$$$) {
16711     # This used only for making the test script.  It generates test cases that
16712     # are expected to compile successfully in perl.  Note that the lhs and
16713     # rhs are assumed to already be as randomized as the caller wants.
16714
16715     my $lhs = shift;           # The property: what's to the left of the colon
16716                                #  or equals separator
16717     my $rhs = shift;           # The property value; what's to the right
16718     my $valid_code = shift;    # A code point that's known to be in the
16719                                # table given by lhs=rhs; undef if table is
16720                                # empty
16721     my $invalid_code = shift;  # A code point known to not be in the table;
16722                                # undef if the table is all code points
16723     my $warning = shift;
16724
16725     # Get the colon or equal
16726     my $separator = generate_separator($lhs);
16727
16728     # The whole 'property=value'
16729     my $name = "$lhs$separator$rhs";
16730
16731     my @output;
16732     # Create a complete set of tests, with complements.
16733     if (defined $valid_code) {
16734         push @output, <<"EOC"
16735 Expect(1, $valid_code, '\\p{$name}', $warning);
16736 Expect(0, $valid_code, '\\p{^$name}', $warning);
16737 Expect(0, $valid_code, '\\P{$name}', $warning);
16738 Expect(1, $valid_code, '\\P{^$name}', $warning);
16739 EOC
16740     }
16741     if (defined $invalid_code) {
16742         push @output, <<"EOC"
16743 Expect(0, $invalid_code, '\\p{$name}', $warning);
16744 Expect(1, $invalid_code, '\\p{^$name}', $warning);
16745 Expect(1, $invalid_code, '\\P{$name}', $warning);
16746 Expect(0, $invalid_code, '\\P{^$name}', $warning);
16747 EOC
16748     }
16749     return @output;
16750 }
16751
16752 sub generate_error($$$) {
16753     # This used only for making the test script.  It generates test cases that
16754     # are expected to not only not match, but to be syntax or similar errors
16755
16756     my $lhs = shift;                # The property: what's to the left of the
16757                                     # colon or equals separator
16758     my $rhs = shift;                # The property value; what's to the right
16759     my $already_in_error = shift;   # Boolean; if true it's known that the
16760                                 # unmodified lhs and rhs will cause an error.
16761                                 # This routine should not force another one
16762     # Get the colon or equal
16763     my $separator = generate_separator($lhs);
16764
16765     # Since this is an error only, don't bother to randomly decide whether to
16766     # put the error on the left or right side; and assume that the rhs is
16767     # loosely matched, again for convenience rather than rigor.
16768     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
16769
16770     my $property = $lhs . $separator . $rhs;
16771
16772     return <<"EOC";
16773 Error('\\p{$property}');
16774 Error('\\P{$property}');
16775 EOC
16776 }
16777
16778 # These are used only for making the test script
16779 # XXX Maybe should also have a bad strict seps, which includes underscore.
16780
16781 my @good_loose_seps = (
16782             " ",
16783             "-",
16784             "\t",
16785             "",
16786             "_",
16787            );
16788 my @bad_loose_seps = (
16789            "/a/",
16790            ':=',
16791           );
16792
16793 sub randomize_stricter_name {
16794     # This used only for making the test script.  Take the input name and
16795     # return a randomized, but valid version of it under the stricter matching
16796     # rules.
16797
16798     my $name = shift;
16799     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16800
16801     # If the name looks like a number (integer, floating, or rational), do
16802     # some extra work
16803     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
16804         my $sign = $1;
16805         my $number = $2;
16806         my $separator = $3;
16807
16808         # If there isn't a sign, part of the time add a plus
16809         # Note: Not testing having any denominator having a minus sign
16810         if (! $sign) {
16811             $sign = '+' if rand() <= .3;
16812         }
16813
16814         # And add 0 or more leading zeros.
16815         $name = $sign . ('0' x int rand(10)) . $number;
16816
16817         if (defined $separator) {
16818             my $extra_zeros = '0' x int rand(10);
16819
16820             if ($separator eq '.') {
16821
16822                 # Similarly, add 0 or more trailing zeros after a decimal
16823                 # point
16824                 $name .= $extra_zeros;
16825             }
16826             else {
16827
16828                 # Or, leading zeros before the denominator
16829                 $name =~ s,/,/$extra_zeros,;
16830             }
16831         }
16832     }
16833
16834     # For legibility of the test, only change the case of whole sections at a
16835     # time.  To do this, first split into sections.  The split returns the
16836     # delimiters
16837     my @sections;
16838     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
16839         trace $section if main::DEBUG && $to_trace;
16840
16841         if (length $section > 1 && $section !~ /\D/) {
16842
16843             # If the section is a sequence of digits, about half the time
16844             # randomly add underscores between some of them.
16845             if (rand() > .5) {
16846
16847                 # Figure out how many underscores to add.  max is 1 less than
16848                 # the number of digits.  (But add 1 at the end to make sure
16849                 # result isn't 0, and compensate earlier by subtracting 2
16850                 # instead of 1)
16851                 my $num_underscores = int rand(length($section) - 2) + 1;
16852
16853                 # And add them evenly throughout, for convenience, not rigor
16854                 use integer;
16855                 my $spacing = (length($section) - 1)/ $num_underscores;
16856                 my $temp = $section;
16857                 $section = "";
16858                 for my $i (1 .. $num_underscores) {
16859                     $section .= substr($temp, 0, $spacing, "") . '_';
16860                 }
16861                 $section .= $temp;
16862             }
16863             push @sections, $section;
16864         }
16865         else {
16866
16867             # Here not a sequence of digits.  Change the case of the section
16868             # randomly
16869             my $switch = int rand(4);
16870             if ($switch == 0) {
16871                 push @sections, uc $section;
16872             }
16873             elsif ($switch == 1) {
16874                 push @sections, lc $section;
16875             }
16876             elsif ($switch == 2) {
16877                 push @sections, ucfirst $section;
16878             }
16879             else {
16880                 push @sections, $section;
16881             }
16882         }
16883     }
16884     trace "returning", join "", @sections if main::DEBUG && $to_trace;
16885     return join "", @sections;
16886 }
16887
16888 sub randomize_loose_name($;$) {
16889     # This used only for making the test script
16890
16891     my $name = shift;
16892     my $want_error = shift;  # if true, make an error
16893     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16894
16895     $name = randomize_stricter_name($name);
16896
16897     my @parts;
16898     push @parts, $good_loose_seps[rand(@good_loose_seps)];
16899
16900     # Preserve trailing ones for the sake of not stripping the underscore from
16901     # 'L_'
16902     for my $part (split /[-\s_]+ (?= . )/, $name) {
16903         if (@parts) {
16904             if ($want_error and rand() < 0.3) {
16905                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
16906                 $want_error = 0;
16907             }
16908             else {
16909                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
16910             }
16911         }
16912         push @parts, $part;
16913     }
16914     my $new = join("", @parts);
16915     trace "$name => $new" if main::DEBUG && $to_trace;
16916
16917     if ($want_error) {
16918         if (rand() >= 0.5) {
16919             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
16920         }
16921         else {
16922             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
16923         }
16924     }
16925     return $new;
16926 }
16927
16928 # Used to make sure don't generate duplicate test cases.
16929 my %test_generated;
16930
16931 sub make_property_test_script() {
16932     # This used only for making the test script
16933     # this written directly -- it's huge.
16934
16935     print "Making test script\n" if $verbosity >= $PROGRESS;
16936
16937     # This uses randomness to test different possibilities without testing all
16938     # possibilities.  To ensure repeatability, set the seed to 0.  But if
16939     # tests are added, it will perturb all later ones in the .t file
16940     srand 0;
16941
16942     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
16943
16944     # Keep going down an order of magnitude
16945     # until find that adding this quantity to
16946     # 1 remains 1; but put an upper limit on
16947     # this so in case this algorithm doesn't
16948     # work properly on some platform, that we
16949     # won't loop forever.
16950     my $digits = 0;
16951     my $min_floating_slop = 1;
16952     while (1+ $min_floating_slop != 1
16953             && $digits++ < 50)
16954     {
16955         my $next = $min_floating_slop / 10;
16956         last if $next == 0; # If underflows,
16957                             # use previous one
16958         $min_floating_slop = $next;
16959     }
16960
16961     # It doesn't matter whether the elements of this array contain single lines
16962     # or multiple lines. main::write doesn't count the lines.
16963     my @output;
16964
16965     foreach my $property (property_ref('*')) {
16966         foreach my $table ($property->tables) {
16967
16968             # Find code points that match, and don't match this table.
16969             my $valid = $table->get_valid_code_point;
16970             my $invalid = $table->get_invalid_code_point;
16971             my $warning = ($table->status eq $DEPRECATED)
16972                             ? "'deprecated'"
16973                             : '""';
16974
16975             # Test each possible combination of the property's aliases with
16976             # the table's.  If this gets to be too many, could do what is done
16977             # in the set_final_comment() for Tables
16978             my @table_aliases = $table->aliases;
16979             my @property_aliases = $table->property->aliases;
16980
16981             # Every property can be optionally be prefixed by 'Is_', so test
16982             # that those work, by creating such a new alias for each
16983             # pre-existing one.
16984             push @property_aliases, map { Alias->new("Is_" . $_->name,
16985                                                     $_->loose_match,
16986                                                     $_->make_re_pod_entry,
16987                                                     $_->ok_as_filename,
16988                                                     $_->status,
16989                                                     $_->ucd,
16990                                                     )
16991                                          } @property_aliases;
16992             my $max = max(scalar @table_aliases, scalar @property_aliases);
16993             for my $j (0 .. $max - 1) {
16994
16995                 # The current alias for property is the next one on the list,
16996                 # or if beyond the end, start over.  Similarly for table
16997                 my $property_name
16998                             = $property_aliases[$j % @property_aliases]->name;
16999
17000                 $property_name = "" if $table->property == $perl;
17001                 my $table_alias = $table_aliases[$j % @table_aliases];
17002                 my $table_name = $table_alias->name;
17003                 my $loose_match = $table_alias->loose_match;
17004
17005                 # If the table doesn't have a file, any test for it is
17006                 # already guaranteed to be in error
17007                 my $already_error = ! $table->file_path;
17008
17009                 # Generate error cases for this alias.
17010                 push @output, generate_error($property_name,
17011                                              $table_name,
17012                                              $already_error);
17013
17014                 # If the table is guaranteed to always generate an error,
17015                 # quit now without generating success cases.
17016                 next if $already_error;
17017
17018                 # Now for the success cases.
17019                 my $random;
17020                 if ($loose_match) {
17021
17022                     # For loose matching, create an extra test case for the
17023                     # standard name.
17024                     my $standard = standardize($table_name);
17025
17026                     # $test_name should be a unique combination for each test
17027                     # case; used just to avoid duplicate tests
17028                     my $test_name = "$property_name=$standard";
17029
17030                     # Don't output duplicate test cases.
17031                     if (! exists $test_generated{$test_name}) {
17032                         $test_generated{$test_name} = 1;
17033                         push @output, generate_tests($property_name,
17034                                                      $standard,
17035                                                      $valid,
17036                                                      $invalid,
17037                                                      $warning,
17038                                                  );
17039                     }
17040                     $random = randomize_loose_name($table_name)
17041                 }
17042                 else { # Stricter match
17043                     $random = randomize_stricter_name($table_name);
17044                 }
17045
17046                 # Now for the main test case for this alias.
17047                 my $test_name = "$property_name=$random";
17048                 if (! exists $test_generated{$test_name}) {
17049                     $test_generated{$test_name} = 1;
17050                     push @output, generate_tests($property_name,
17051                                                  $random,
17052                                                  $valid,
17053                                                  $invalid,
17054                                                  $warning,
17055                                              );
17056
17057                     # If the name is a rational number, add tests for the
17058                     # floating point equivalent.
17059                     if ($table_name =~ qr{/}) {
17060
17061                         # Calculate the float, and find just the fraction.
17062                         my $float = eval $table_name;
17063                         my ($whole, $fraction)
17064                                             = $float =~ / (.*) \. (.*) /x;
17065
17066                         # Starting with one digit after the decimal point,
17067                         # create a test for each possible precision (number of
17068                         # digits past the decimal point) until well beyond the
17069                         # native number found on this machine.  (If we started
17070                         # with 0 digits, it would be an integer, which could
17071                         # well match an unrelated table)
17072                         PLACE:
17073                         for my $i (1 .. $min_floating_slop + 3) {
17074                             my $table_name = sprintf("%.*f", $i, $float);
17075                             if ($i < $MIN_FRACTION_LENGTH) {
17076
17077                                 # If the test case has fewer digits than the
17078                                 # minimum acceptable precision, it shouldn't
17079                                 # succeed, so we expect an error for it.
17080                                 # E.g., 2/3 = .7 at one decimal point, and we
17081                                 # shouldn't say it matches .7.  We should make
17082                                 # it be .667 at least before agreeing that the
17083                                 # intent was to match 2/3.  But at the
17084                                 # less-than- acceptable level of precision, it
17085                                 # might actually match an unrelated number.
17086                                 # So don't generate a test case if this
17087                                 # conflating is possible.  In our example, we
17088                                 # don't want 2/3 matching 7/10, if there is
17089                                 # a 7/10 code point.
17090                                 for my $existing
17091                                         (keys %nv_floating_to_rational)
17092                                 {
17093                                     next PLACE
17094                                         if abs($table_name - $existing)
17095                                                 < $MAX_FLOATING_SLOP;
17096                                 }
17097                                 push @output, generate_error($property_name,
17098                                                              $table_name,
17099                                                              1   # 1 => already an error
17100                                               );
17101                             }
17102                             else {
17103
17104                                 # Here the number of digits exceeds the
17105                                 # minimum we think is needed.  So generate a
17106                                 # success test case for it.
17107                                 push @output, generate_tests($property_name,
17108                                                              $table_name,
17109                                                              $valid,
17110                                                              $invalid,
17111                                                              $warning,
17112                                              );
17113                             }
17114                         }
17115                     }
17116                 }
17117             }
17118         }
17119     }
17120
17121     &write($t_path,
17122            0,           # Not utf8;
17123            [<DATA>,
17124             @output,
17125             (map {"Test_X('$_');\n"} @backslash_X_tests),
17126             "Finished();\n"]);
17127     return;
17128 }
17129
17130 sub make_normalization_test_script() {
17131     print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17132
17133     my $n_path = 'TestNorm.pl';
17134
17135     unshift @normalization_tests, <<'END';
17136 use utf8;
17137 use Test::More;
17138
17139 sub ord_string {    # Convert packed ords to printable string
17140     use charnames ();
17141     return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17142                                                 unpack "U*", shift) .  "'";
17143     #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
17144 }
17145
17146 sub Test_N {
17147     my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17148     my $display_source = ord_string($source);
17149     my $display_nfc = ord_string($nfc);
17150     my $display_nfd = ord_string($nfd);
17151     my $display_nfkc = ord_string($nfkc);
17152     my $display_nfkd = ord_string($nfkd);
17153
17154     use Unicode::Normalize;
17155     #    NFC
17156     #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
17157     #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
17158     #
17159     #    NFD
17160     #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
17161     #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
17162     #
17163     #    NFKC
17164     #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17165     #      toNFKC(nfkc) == toNFKC(nfkd)
17166     #
17167     #    NFKD
17168     #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17169     #      toNFKD(nfkc) == toNFKD(nfkd)
17170
17171     is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17172     is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17173     is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17174     is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17175     is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17176
17177     is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17178     is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17179     is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17180     is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17181     is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17182
17183     is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17184     is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17185     is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17186     is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17187     is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17188
17189     is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17190     is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17191     is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17192     is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17193     is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17194 }
17195 END
17196
17197     &write($n_path,
17198            1,           # Is utf8;
17199            [
17200             @normalization_tests,
17201             'done_testing();'
17202             ]);
17203     return;
17204 }
17205
17206 # This is a list of the input files and how to handle them.  The files are
17207 # processed in their order in this list.  Some reordering is possible if
17208 # desired, but the v0 files should be first, and the extracted before the
17209 # others except DAge.txt (as data in an extracted file can be over-ridden by
17210 # the non-extracted.  Some other files depend on data derived from an earlier
17211 # file, like UnicodeData requires data from Jamo, and the case changing and
17212 # folding requires data from Unicode.  Mostly, it is safest to order by first
17213 # version releases in (except the Jamo).  DAge.txt is read before the
17214 # extracted ones because of the rarely used feature $compare_versions.  In the
17215 # unlikely event that there were ever an extracted file that contained the Age
17216 # property information, it would have to go in front of DAge.
17217 #
17218 # The version strings allow the program to know whether to expect a file or
17219 # not, but if a file exists in the directory, it will be processed, even if it
17220 # is in a version earlier than expected, so you can copy files from a later
17221 # release into an earlier release's directory.
17222 my @input_file_objects = (
17223     Input_file->new('PropertyAliases.txt', v0,
17224                     Handler => \&process_PropertyAliases,
17225                     ),
17226     Input_file->new(undef, v0,  # No file associated with this
17227                     Progress_Message => 'Finishing property setup',
17228                     Handler => \&finish_property_setup,
17229                     ),
17230     Input_file->new('PropValueAliases.txt', v0,
17231                      Handler => \&process_PropValueAliases,
17232                      Has_Missings_Defaults => $NOT_IGNORED,
17233                      ),
17234     Input_file->new('DAge.txt', v3.2.0,
17235                     Has_Missings_Defaults => $NOT_IGNORED,
17236                     Property => 'Age'
17237                     ),
17238     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17239                     Property => 'General_Category',
17240                     ),
17241     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17242                     Property => 'Canonical_Combining_Class',
17243                     Has_Missings_Defaults => $NOT_IGNORED,
17244                     ),
17245     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17246                     Property => 'Numeric_Type',
17247                     Has_Missings_Defaults => $NOT_IGNORED,
17248                     ),
17249     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17250                     Property => 'East_Asian_Width',
17251                     Has_Missings_Defaults => $NOT_IGNORED,
17252                     ),
17253     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17254                     Property => 'Line_Break',
17255                     Has_Missings_Defaults => $NOT_IGNORED,
17256                     ),
17257     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17258                     Property => 'Bidi_Class',
17259                     Has_Missings_Defaults => $NOT_IGNORED,
17260                     ),
17261     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17262                     Property => 'Decomposition_Type',
17263                     Has_Missings_Defaults => $NOT_IGNORED,
17264                     ),
17265     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17266     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17267                     Property => 'Numeric_Value',
17268                     Each_Line_Handler => \&filter_numeric_value_line,
17269                     Has_Missings_Defaults => $NOT_IGNORED,
17270                     ),
17271     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17272                     Property => 'Joining_Group',
17273                     Has_Missings_Defaults => $NOT_IGNORED,
17274                     ),
17275
17276     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17277                     Property => 'Joining_Type',
17278                     Has_Missings_Defaults => $NOT_IGNORED,
17279                     ),
17280     Input_file->new('Jamo.txt', v2.0.0,
17281                     Property => 'Jamo_Short_Name',
17282                     Each_Line_Handler => \&filter_jamo_line,
17283                     ),
17284     Input_file->new('UnicodeData.txt', v1.1.5,
17285                     Pre_Handler => \&setup_UnicodeData,
17286
17287                     # We clean up this file for some early versions.
17288                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
17289                                             ? \&filter_v1_ucd
17290                                             : ($v_version eq v2.1.5)
17291                                                 ? \&filter_v2_1_5_ucd
17292
17293                                                 # And for 5.14 Perls with 6.0,
17294                                                 # have to also make changes
17295                                                 : ($v_version ge v6.0.0
17296                                                    && $^V lt v5.17.0)
17297                                                     ? \&filter_v6_ucd
17298                                                     : undef),
17299
17300                                             # Early versions did not have the
17301                                             # proper Unicode_1 names for the
17302                                             # controls
17303                                             (($v_version lt v3.0.0)
17304                                             ? \&filter_early_U1_names
17305                                             : undef),
17306
17307                                             # Early versions did not correctly
17308                                             # use the later method for giving
17309                                             # decimal digit values
17310                                             (($v_version le v3.2.0)
17311                                             ? \&filter_bad_Nd_ucd
17312                                             : undef),
17313
17314                                             # And the main filter
17315                                             \&filter_UnicodeData_line,
17316                                          ],
17317                     EOF_Handler => \&EOF_UnicodeData,
17318                     ),
17319     Input_file->new('ArabicShaping.txt', v2.0.0,
17320                     Each_Line_Handler =>
17321                         [ ($v_version lt 4.1.0)
17322                                     ? \&filter_old_style_arabic_shaping
17323                                     : undef,
17324                         \&filter_arabic_shaping_line,
17325                         ],
17326                     Has_Missings_Defaults => $NOT_IGNORED,
17327                     ),
17328     Input_file->new('Blocks.txt', v2.0.0,
17329                     Property => 'Block',
17330                     Has_Missings_Defaults => $NOT_IGNORED,
17331                     Each_Line_Handler => \&filter_blocks_lines
17332                     ),
17333     Input_file->new('PropList.txt', v2.0.0,
17334                     Each_Line_Handler => (($v_version lt v3.1.0)
17335                                             ? \&filter_old_style_proplist
17336                                             : undef),
17337                     ),
17338     Input_file->new('Unihan.txt', v2.0.0,
17339                     Pre_Handler => \&setup_unihan,
17340                     Optional => 1,
17341                     Each_Line_Handler => \&filter_unihan_line,
17342                         ),
17343     Input_file->new('SpecialCasing.txt', v2.1.8,
17344                     Each_Line_Handler => ($v_version eq 2.1.8)
17345                                          ? \&filter_2_1_8_special_casing_line
17346                                          : \&filter_special_casing_line,
17347                     Pre_Handler => \&setup_special_casing,
17348                     Has_Missings_Defaults => $IGNORED,
17349                     ),
17350     Input_file->new(
17351                     'LineBreak.txt', v3.0.0,
17352                     Has_Missings_Defaults => $NOT_IGNORED,
17353                     Property => 'Line_Break',
17354                     # Early versions had problematic syntax
17355                     Each_Line_Handler => (($v_version lt v3.1.0)
17356                                         ? \&filter_early_ea_lb
17357                                         : undef),
17358                     ),
17359     Input_file->new('EastAsianWidth.txt', v3.0.0,
17360                     Property => 'East_Asian_Width',
17361                     Has_Missings_Defaults => $NOT_IGNORED,
17362                     # Early versions had problematic syntax
17363                     Each_Line_Handler => (($v_version lt v3.1.0)
17364                                         ? \&filter_early_ea_lb
17365                                         : undef),
17366                     ),
17367     Input_file->new('CompositionExclusions.txt', v3.0.0,
17368                     Property => 'Composition_Exclusion',
17369                     ),
17370     Input_file->new('BidiMirroring.txt', v3.0.1,
17371                     Property => 'Bidi_Mirroring_Glyph',
17372                     Has_Missings_Defaults => ($v_version lt v6.2.0)
17373                                               ? $NO_DEFAULTS
17374                                               # Is <none> which doesn't mean
17375                                               # anything to us, we will use the
17376                                               # null string
17377                                               : $IGNORED,
17378
17379                     ),
17380     Input_file->new("NormTest.txt", v3.0.0,
17381                      Handler => \&process_NormalizationsTest,
17382                      Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
17383                     ),
17384     Input_file->new('CaseFolding.txt', v3.0.1,
17385                     Pre_Handler => \&setup_case_folding,
17386                     Each_Line_Handler =>
17387                         [ ($v_version lt v3.1.0)
17388                                  ? \&filter_old_style_case_folding
17389                                  : undef,
17390                            \&filter_case_folding_line
17391                         ],
17392                     Has_Missings_Defaults => $IGNORED,
17393                     ),
17394     Input_file->new('DCoreProperties.txt', v3.1.0,
17395                     # 5.2 changed this file
17396                     Has_Missings_Defaults => (($v_version ge v5.2.0)
17397                                             ? $NOT_IGNORED
17398                                             : $NO_DEFAULTS),
17399                     ),
17400     Input_file->new('Scripts.txt', v3.1.0,
17401                     Property => 'Script',
17402                     Has_Missings_Defaults => $NOT_IGNORED,
17403                     ),
17404     Input_file->new('DNormalizationProps.txt', v3.1.0,
17405                     Has_Missings_Defaults => $NOT_IGNORED,
17406                     Each_Line_Handler => (($v_version lt v4.0.1)
17407                                       ? \&filter_old_style_normalization_lines
17408                                       : undef),
17409                     ),
17410     Input_file->new('HangulSyllableType.txt', v0,
17411                     Has_Missings_Defaults => $NOT_IGNORED,
17412                     Property => 'Hangul_Syllable_Type',
17413                     Pre_Handler => ($v_version lt v4.0.0)
17414                                    ? \&generate_hst
17415                                    : undef,
17416                     ),
17417     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
17418                     Property => 'Word_Break',
17419                     Has_Missings_Defaults => $NOT_IGNORED,
17420                     ),
17421     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
17422                     Property => 'Grapheme_Cluster_Break',
17423                     Has_Missings_Defaults => $NOT_IGNORED,
17424                     Pre_Handler => ($v_version lt v4.1.0)
17425                                    ? \&generate_GCB
17426                                    : undef,
17427                     ),
17428     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
17429                     Handler => \&process_GCB_test,
17430                     ),
17431     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
17432                     Skip => 'Validation Tests',
17433                     ),
17434     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
17435                     Skip => 'Validation Tests',
17436                     ),
17437     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
17438                     Skip => 'Validation Tests',
17439                     ),
17440     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
17441                     Property => 'Sentence_Break',
17442                     Has_Missings_Defaults => $NOT_IGNORED,
17443                     ),
17444     Input_file->new('NamedSequences.txt', v4.1.0,
17445                     Handler => \&process_NamedSequences
17446                     ),
17447     Input_file->new('NameAliases.txt', v0,
17448                     Property => 'Name_Alias',
17449                     Pre_Handler => ($v_version le v6.0.0)
17450                                    ? \&setup_early_name_alias
17451                                    : undef,
17452                     Each_Line_Handler => ($v_version le v6.0.0)
17453                                    ? \&filter_early_version_name_alias_line
17454                                    : \&filter_later_version_name_alias_line,
17455                     ),
17456     Input_file->new("BidiTest.txt", v5.2.0,
17457                     Skip => 'Validation Tests',
17458                     ),
17459     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
17460                     Optional => 1,
17461                     Each_Line_Handler => \&filter_unihan_line,
17462                     ),
17463     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
17464                     Optional => 1,
17465                     Each_Line_Handler => \&filter_unihan_line,
17466                     ),
17467     Input_file->new('UnihanIRGSources.txt', v5.2.0,
17468                     Optional => 1,
17469                     Pre_Handler => \&setup_unihan,
17470                     Each_Line_Handler => \&filter_unihan_line,
17471                     ),
17472     Input_file->new('UnihanNumericValues.txt', v5.2.0,
17473                     Optional => 1,
17474                     Each_Line_Handler => \&filter_unihan_line,
17475                     ),
17476     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
17477                     Optional => 1,
17478                     Each_Line_Handler => \&filter_unihan_line,
17479                     ),
17480     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
17481                     Optional => 1,
17482                     Each_Line_Handler => \&filter_unihan_line,
17483                     ),
17484     Input_file->new('UnihanReadings.txt', v5.2.0,
17485                     Optional => 1,
17486                     Each_Line_Handler => \&filter_unihan_line,
17487                     ),
17488     Input_file->new('UnihanVariants.txt', v5.2.0,
17489                     Optional => 1,
17490                     Each_Line_Handler => \&filter_unihan_line,
17491                     ),
17492     Input_file->new('ScriptExtensions.txt', v6.0.0,
17493                     Property => 'Script_Extensions',
17494                     Pre_Handler => \&setup_script_extensions,
17495                     Each_Line_Handler => \&filter_script_extensions_line,
17496                     Has_Missings_Defaults => (($v_version le v6.0.0)
17497                                             ? $NO_DEFAULTS
17498                                             : $IGNORED),
17499                     ),
17500     # The two Indic files are actually available starting in v6.0.0, but their
17501     # property values are missing from PropValueAliases.txt in that release,
17502     # so that further work would have to be done to get them to work properly
17503     # for that release.
17504     Input_file->new('IndicMatraCategory.txt', v6.1.0,
17505                     Property => 'Indic_Matra_Category',
17506                     Has_Missings_Defaults => $NOT_IGNORED,
17507                     Skip => "Provisional; for the analysis and processing of Indic scripts",
17508                     ),
17509     Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
17510                     Property => 'Indic_Syllabic_Category',
17511                     Has_Missings_Defaults => $NOT_IGNORED,
17512                     Skip => "Provisional; for the analysis and processing of Indic scripts",
17513                     ),
17514 );
17515
17516 # End of all the preliminaries.
17517 # Do it...
17518
17519 if ($compare_versions) {
17520     Carp::my_carp(<<END
17521 Warning.  \$compare_versions is set.  Output is not suitable for production
17522 END
17523     );
17524 }
17525
17526 # Put into %potential_files a list of all the files in the directory structure
17527 # that could be inputs to this program, excluding those that we should ignore.
17528 # Use absolute file names because it makes it easier across machine types.
17529 my @ignored_files_full_names = map { File::Spec->rel2abs(
17530                                      internal_file_to_platform($_))
17531                                 } keys %ignored_files;
17532 File::Find::find({
17533     wanted=>sub {
17534         return unless /\.txt$/i;  # Some platforms change the name's case
17535         my $full = lc(File::Spec->rel2abs($_));
17536         $potential_files{$full} = 1
17537                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
17538         return;
17539     }
17540 }, File::Spec->curdir());
17541
17542 my @mktables_list_output_files;
17543 my $old_start_time = 0;
17544
17545 if (! -e $file_list) {
17546     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
17547     $write_unchanged_files = 1;
17548 } elsif ($write_unchanged_files) {
17549     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
17550 }
17551 else {
17552     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
17553     my $file_handle;
17554     if (! open $file_handle, "<", $file_list) {
17555         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
17556         $glob_list = 1;
17557     }
17558     else {
17559         my @input;
17560
17561         # Read and parse mktables.lst, placing the results from the first part
17562         # into @input, and the second part into @mktables_list_output_files
17563         for my $list ( \@input, \@mktables_list_output_files ) {
17564             while (<$file_handle>) {
17565                 s/^ \s+ | \s+ $//xg;
17566                 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
17567                     $old_start_time = $1;
17568                 }
17569                 next if /^ \s* (?: \# .* )? $/x;
17570                 last if /^ =+ $/x;
17571                 my ( $file ) = split /\t/;
17572                 push @$list, $file;
17573             }
17574             @$list = uniques(@$list);
17575             next;
17576         }
17577
17578         # Look through all the input files
17579         foreach my $input (@input) {
17580             next if $input eq 'version'; # Already have checked this.
17581
17582             # Ignore if doesn't exist.  The checking about whether we care or
17583             # not is done via the Input_file object.
17584             next if ! file_exists($input);
17585
17586             # The paths are stored with relative names, and with '/' as the
17587             # delimiter; convert to absolute on this machine
17588             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
17589             $potential_files{lc $full} = 1
17590                 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
17591         }
17592     }
17593
17594     close $file_handle;
17595 }
17596
17597 if ($glob_list) {
17598
17599     # Here wants to process all .txt files in the directory structure.
17600     # Convert them to full path names.  They are stored in the platform's
17601     # relative style
17602     my @known_files;
17603     foreach my $object (@input_file_objects) {
17604         my $file = $object->file;
17605         next unless defined $file;
17606         push @known_files, File::Spec->rel2abs($file);
17607     }
17608
17609     my @unknown_input_files;
17610     foreach my $file (keys %potential_files) {  # The keys are stored in lc
17611         next if grep { $file eq lc($_) } @known_files;
17612
17613         # Here, the file is unknown to us.  Get relative path name
17614         $file = File::Spec->abs2rel($file);
17615         push @unknown_input_files, $file;
17616
17617         # What will happen is we create a data structure for it, and add it to
17618         # the list of input files to process.  First get the subdirectories
17619         # into an array
17620         my (undef, $directories, undef) = File::Spec->splitpath($file);
17621         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
17622         my @directories = File::Spec->splitdir($directories);
17623
17624         # If the file isn't extracted (meaning none of the directories is the
17625         # extracted one), just add it to the end of the list of inputs.
17626         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
17627             push @input_file_objects, Input_file->new($file, v0);
17628         }
17629         else {
17630
17631             # Here, the file is extracted.  It needs to go ahead of most other
17632             # processing.  Search for the first input file that isn't a
17633             # special required property (that is, find one whose first_release
17634             # is non-0), and isn't extracted.  Also, the Age property file is
17635             # processed before the extracted ones, just in case
17636             # $compare_versions is set.
17637             for (my $i = 0; $i < @input_file_objects; $i++) {
17638                 if ($input_file_objects[$i]->first_released ne v0
17639                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
17640                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
17641                 {
17642                     splice @input_file_objects, $i, 0,
17643                                                 Input_file->new($file, v0);
17644                     last;
17645                 }
17646             }
17647
17648         }
17649     }
17650     if (@unknown_input_files) {
17651         print STDERR simple_fold(join_lines(<<END
17652
17653 The following files are unknown as to how to handle.  Assuming they are
17654 typical property files.  You'll know by later error messages if it worked or
17655 not:
17656 END
17657         ) . " " . join(", ", @unknown_input_files) . "\n\n");
17658     }
17659 } # End of looking through directory structure for more .txt files.
17660
17661 # Create the list of input files from the objects we have defined, plus
17662 # version
17663 my @input_files = qw(version Makefile);
17664 foreach my $object (@input_file_objects) {
17665     my $file = $object->file;
17666     next if ! defined $file;    # Not all objects have files
17667     next if $object->optional && ! -e $file;
17668     push @input_files,  $file;
17669 }
17670
17671 if ( $verbosity >= $VERBOSE ) {
17672     print "Expecting ".scalar( @input_files )." input files. ",
17673          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
17674 }
17675
17676 # We set $most_recent to be the most recently changed input file, including
17677 # this program itself (done much earlier in this file)
17678 foreach my $in (@input_files) {
17679     next unless -e $in;        # Keep going even if missing a file
17680     my $mod_time = (stat $in)[9];
17681     $most_recent = $mod_time if $mod_time > $most_recent;
17682
17683     # See that the input files have distinct names, to warn someone if they
17684     # are adding a new one
17685     if ($make_list) {
17686         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
17687         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
17688         my @directories = File::Spec->splitdir($directories);
17689         my $base = $file =~ s/\.txt$//;
17690         construct_filename($file, 'mutable', \@directories);
17691     }
17692 }
17693
17694 # We use 'Makefile' just to see if it has changed since the last time we
17695 # rebuilt.  Now discard it.
17696 @input_files = grep { $_ ne 'Makefile' } @input_files;
17697
17698 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
17699               || ! scalar @mktables_list_output_files  # or if no outputs known
17700               || $old_start_time < $most_recent;       # or out-of-date
17701
17702 # Now we check to see if any output files are older than youngest, if
17703 # they are, we need to continue on, otherwise we can presumably bail.
17704 if (! $rebuild) {
17705     foreach my $out (@mktables_list_output_files) {
17706         if ( ! file_exists($out)) {
17707             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
17708             $rebuild = 1;
17709             last;
17710          }
17711         #local $to_trace = 1 if main::DEBUG;
17712         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
17713         if ( (stat $out)[9] <= $most_recent ) {
17714             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
17715             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
17716             $rebuild = 1;
17717             last;
17718         }
17719     }
17720 }
17721 if (! $rebuild) {
17722     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
17723     exit(0);
17724 }
17725 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
17726
17727 # Ready to do the major processing.  First create the perl pseudo-property.
17728 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
17729
17730 # Process each input file
17731 foreach my $file (@input_file_objects) {
17732     $file->run;
17733 }
17734
17735 # Finish the table generation.
17736
17737 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
17738 finish_Unicode();
17739
17740 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
17741 compile_perl();
17742
17743 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
17744 add_perl_synonyms();
17745
17746 print "Writing tables\n" if $verbosity >= $PROGRESS;
17747 write_all_tables();
17748
17749 # Write mktables.lst
17750 if ( $file_list and $make_list ) {
17751
17752     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
17753     foreach my $file (@input_files, @files_actually_output) {
17754         my (undef, $directories, $file) = File::Spec->splitpath($file);
17755         my @directories = File::Spec->splitdir($directories);
17756         $file = join '/', @directories, $file;
17757     }
17758
17759     my $ofh;
17760     if (! open $ofh,">",$file_list) {
17761         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
17762         return
17763     }
17764     else {
17765         my $localtime = localtime $start_time;
17766         print $ofh <<"END";
17767 #
17768 # $file_list -- File list for $0.
17769 #
17770 #   Autogenerated starting on $start_time ($localtime)
17771 #
17772 # - First section is input files
17773 #   ($0 itself is not listed but is automatically considered an input)
17774 # - Section separator is /^=+\$/
17775 # - Second section is a list of output files.
17776 # - Lines matching /^\\s*#/ are treated as comments
17777 #   which along with blank lines are ignored.
17778 #
17779
17780 # Input files:
17781
17782 END
17783         print $ofh "$_\n" for sort(@input_files);
17784         print $ofh "\n=================================\n# Output files:\n\n";
17785         print $ofh "$_\n" for sort @files_actually_output;
17786         print $ofh "\n# ",scalar(@input_files)," input files\n",
17787                 "# ",scalar(@files_actually_output)+1," output files\n\n",
17788                 "# End list\n";
17789         close $ofh
17790             or Carp::my_carp("Failed to close $ofh: $!");
17791
17792         print "Filelist has ",scalar(@input_files)," input files and ",
17793             scalar(@files_actually_output)+1," output files\n"
17794             if $verbosity >= $VERBOSE;
17795     }
17796 }
17797
17798 # Output these warnings unless -q explicitly specified.
17799 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
17800     if (@unhandled_properties) {
17801         print "\nProperties and tables that unexpectedly have no code points\n";
17802         foreach my $property (sort @unhandled_properties) {
17803             print $property, "\n";
17804         }
17805     }
17806
17807     if (%potential_files) {
17808         print "\nInput files that are not considered:\n";
17809         foreach my $file (sort keys %potential_files) {
17810             print File::Spec->abs2rel($file), "\n";
17811         }
17812     }
17813     print "\nAll done\n" if $verbosity >= $VERBOSE;
17814 }
17815 exit(0);
17816
17817 # TRAILING CODE IS USED BY make_property_test_script()
17818 __DATA__
17819
17820 use strict;
17821 use warnings;
17822
17823 # If run outside the normal test suite on an ASCII platform, you can
17824 # just create a latin1_to_native() function that just returns its
17825 # inputs, because that's the only function used from test.pl
17826 require "test.pl";
17827
17828 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
17829 # constructed by mktables from the tables it generates, so if mktables is
17830 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
17831 # feasible properties; a few aren't currently feasible; see
17832 # is_code_point_usable() in mktables for details.
17833
17834 # Standard test packages are not used because this manipulates SIG_WARN.  It
17835 # exits 0 if every non-skipped test succeeded; -1 if any failed.
17836
17837 my $Tests = 0;
17838 my $Fails = 0;
17839
17840 sub Expect($$$$) {
17841     my $expected = shift;
17842     my $ord = shift;
17843     my $regex  = shift;
17844     my $warning_type = shift;   # Type of warning message, like 'deprecated'
17845                                 # or empty if none
17846     my $line   = (caller)[2];
17847     $ord = ord(latin1_to_native(chr($ord)));
17848
17849     # Convert the code point to hex form
17850     my $string = sprintf "\"\\x{%04X}\"", $ord;
17851
17852     my @tests = "";
17853
17854     # The first time through, use all warnings.  If the input should generate
17855     # a warning, add another time through with them turned off
17856     push @tests, "no warnings '$warning_type';" if $warning_type;
17857
17858     foreach my $no_warnings (@tests) {
17859
17860         # Store any warning messages instead of outputting them
17861         local $SIG{__WARN__} = $SIG{__WARN__};
17862         my $warning_message;
17863         $SIG{__WARN__} = sub { $warning_message = $_[0] };
17864
17865         $Tests++;
17866
17867         # A string eval is needed because of the 'no warnings'.
17868         # Assumes no parens in the regular expression
17869         my $result = eval "$no_warnings
17870                             my \$RegObj = qr($regex);
17871                             $string =~ \$RegObj ? 1 : 0";
17872         if (not defined $result) {
17873             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
17874             $Fails++;
17875         }
17876         elsif ($result ^ $expected) {
17877             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
17878             $Fails++;
17879         }
17880         elsif ($warning_message) {
17881             if (! $warning_type || ($warning_type && $no_warnings)) {
17882                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
17883                 $Fails++;
17884             }
17885             else {
17886                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
17887             }
17888         }
17889         elsif ($warning_type && ! $no_warnings) {
17890             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
17891             $Fails++;
17892         }
17893         else {
17894             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
17895         }
17896     }
17897     return;
17898 }
17899
17900 sub Error($) {
17901     my $regex  = shift;
17902     $Tests++;
17903     if (eval { 'x' =~ qr/$regex/; 1 }) {
17904         $Fails++;
17905         my $line = (caller)[2];
17906         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
17907     }
17908     else {
17909         my $line = (caller)[2];
17910         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
17911     }
17912     return;
17913 }
17914
17915 # GCBTest.txt character that separates grapheme clusters
17916 my $breakable_utf8 = my $breakable = chr(0xF7);
17917 utf8::upgrade($breakable_utf8);
17918
17919 # GCBTest.txt character that indicates that the adjoining code points are part
17920 # of the same grapheme cluster
17921 my $nobreak_utf8 = my $nobreak = chr(0xD7);
17922 utf8::upgrade($nobreak_utf8);
17923
17924 sub Test_X($) {
17925     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
17926     # Each such line is a sequence of code points given by their hex numbers,
17927     # separated by the two characters defined just before this subroutine that
17928     # indicate that either there can or cannot be a break between the adjacent
17929     # code points.  If there isn't a break, that means the sequence forms an
17930     # extended grapheme cluster, which means that \X should match the whole
17931     # thing.  If there is a break, \X should stop there.  This is all
17932     # converted by this routine into a match:
17933     #   $string =~ /(\X)/,
17934     # Each \X should match the next cluster; and that is what is checked.
17935
17936     my $template = shift;
17937
17938     my $line   = (caller)[2];
17939
17940     # The line contains characters above the ASCII range, but in Latin1.  It
17941     # may or may not be in utf8, and if it is, it may or may not know it.  So,
17942     # convert these characters to 8 bits.  If knows is in utf8, simply
17943     # downgrade.
17944     if (utf8::is_utf8($template)) {
17945         utf8::downgrade($template);
17946     } else {
17947
17948         # Otherwise, if it is in utf8, but doesn't know it, the next lines
17949         # convert the two problematic characters to their 8-bit equivalents.
17950         # If it isn't in utf8, they don't harm anything.
17951         use bytes;
17952         $template =~ s/$nobreak_utf8/$nobreak/g;
17953         $template =~ s/$breakable_utf8/$breakable/g;
17954     }
17955
17956     # Get rid of the leading and trailing breakables
17957     $template =~ s/^ \s* $breakable \s* //x;
17958     $template =~ s/ \s* $breakable \s* $ //x;
17959
17960     # And no-breaks become just a space.
17961     $template =~ s/ \s* $nobreak \s* / /xg;
17962
17963     # Split the input into segments that are breakable between them.
17964     my @segments = split /\s*$breakable\s*/, $template;
17965
17966     my $string = "";
17967     my $display_string = "";
17968     my @should_match;
17969     my @should_display;
17970
17971     # Convert the code point sequence in each segment into a Perl string of
17972     # characters
17973     foreach my $segment (@segments) {
17974         my @code_points = split /\s+/, $segment;
17975         my $this_string = "";
17976         my $this_display = "";
17977         foreach my $code_point (@code_points) {
17978             $this_string .= latin1_to_native(chr(hex $code_point));
17979             $this_display .= "\\x{$code_point}";
17980         }
17981
17982         # The next cluster should match the string in this segment.
17983         push @should_match, $this_string;
17984         push @should_display, $this_display;
17985         $string .= $this_string;
17986         $display_string .= $this_display;
17987     }
17988
17989     # If a string can be represented in both non-ut8 and utf8, test both cases
17990     UPGRADE:
17991     for my $to_upgrade (0 .. 1) {
17992
17993         if ($to_upgrade) {
17994
17995             # If already in utf8, would just be a repeat
17996             next UPGRADE if utf8::is_utf8($string);
17997
17998             utf8::upgrade($string);
17999         }
18000
18001         # Finally, do the \X match.
18002         my @matches = $string =~ /(\X)/g;
18003
18004         # Look through each matched cluster to verify that it matches what we
18005         # expect.
18006         my $min = (@matches < @should_match) ? @matches : @should_match;
18007         for my $i (0 .. $min - 1) {
18008             $Tests++;
18009             if ($matches[$i] eq $should_match[$i]) {
18010                 print "ok $Tests - ";
18011                 if ($i == 0) {
18012                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18013                 } else {
18014                     print "And \\X #", $i + 1,
18015                 }
18016                 print " correctly matched $should_display[$i]; line $line\n";
18017             } else {
18018                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18019                                                     unpack("U*", $matches[$i]));
18020                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18021                     $i + 1,
18022                     " should have matched $should_display[$i]",
18023                     " but instead matched $matches[$i]",
18024                     ".  Abandoning rest of line $line\n";
18025                 next UPGRADE;
18026             }
18027         }
18028
18029         # And the number of matches should equal the number of expected matches.
18030         $Tests++;
18031         if (@matches == @should_match) {
18032             print "ok $Tests - Nothing was left over; line $line\n";
18033         } else {
18034             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18035         }
18036     }
18037
18038     return;
18039 }
18040
18041 sub Finished() {
18042     print "1..$Tests\n";
18043     exit($Fails ? -1 : 0);
18044 }
18045
18046 Error('\p{Script=InGreek}');    # Bug #69018
18047 Test_X("1100 $nobreak 1161");  # Bug #70940
18048 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18049 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18050 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726