lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34 use re "/aa";
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 ##########################################################################
  40 #
  41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  42 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  43 # a pod file and .t files, depending on option parameters.
  44 #
  45 # The structure of this file is:
  46 #   First these introductory comments; then
  47 #   code needed for everywhere, such as debugging stuff; then
  48 #   code to handle input parameters; then
  49 #   data structures likely to be of external interest (some of which depend on
  50 #       the input parameters, so follows them; then
  51 #   more data structures and subroutine and package (class) definitions; then
  52 #   the small actual loop to process the input files and finish up; then
  53 #   a __DATA__ section, for the .t tests
  54 #
  55 # This program works on all releases of Unicode so far.  The outputs have been
  56 # scrutinized most intently for release 5.1.  The others have been checked for
  57 # somewhat more than just sanity.  It can handle all non-provisional Unicode
  58 # character properties in those releases.
  59 #
  60 # This program is mostly about Unicode character (or code point) properties.
  61 # A property describes some attribute or quality of a code point, like if it
  62 # is lowercase or not, its name, what version of Unicode it was first defined
  63 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  64 # possibilities by making all properties into mappings from each code point
  65 # into some corresponding value.  In the case of it being lowercase or not,
  66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  67 # property maps each Unicode code point to a single value, called a "property
  68 # value".  (Some more recently defined properties, map a code point to a set
  69 # of values.)
  70 #
  71 # When using a property in a regular expression, what is desired isn't the
  72 # mapping of the code point to its property's value, but the reverse (or the
  73 # mathematical "inverse relation"): starting with the property value, "Does a
  74 # code point map to it?"  These are written in a "compound" form:
  75 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  76 # files containing the lists of code points that map to each such regular
  77 # expression property value, one file per list
  78 #
  79 # There is also a single form shortcut that Perl adds for many of the commonly
  80 # used properties.  This happens for all binary properties, plus script,
  81 # general_category, and block properties.
  82 #
  83 # Thus the outputs of this program are files.  There are map files, mostly in
  84 # the 'To' directory; and there are list files for use in regular expression
  85 # matching, all in subdirectories of the 'lib' directory, with each
  86 # subdirectory being named for the property that the lists in it are for.
  87 # Bookkeeping, test, and documentation files are also generated.
  88
  89 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  90 my $map_directory = 'To';        # Where map files go.
  91
  92 # DATA STRUCTURES
  93 #
  94 # The major data structures of this program are Property, of course, but also
  95 # Table.  There are two kinds of tables, very similar to each other.
  96 # "Match_Table" is the data structure giving the list of code points that have
  97 # a particular property value, mentioned above.  There is also a "Map_Table"
  98 # data structure which gives the property's mapping from code point to value.
  99 # There are two structures because the match tables need to be combined in
 100 # various ways, such as constructing unions, intersections, complements, etc.,
 101 # and the map ones don't.  And there would be problems, perhaps subtle, if
 102 # a map table were inadvertently operated on in some of those ways.
 103 # The use of separate classes with operations defined on one but not the other
 104 # prevents accidentally confusing the two.
 105 #
 106 # At the heart of each table's data structure is a "Range_List", which is just
 107 # an ordered list of "Ranges", plus ancillary information, and methods to
 108 # operate on them.  A Range is a compact way to store property information.
 109 # Each range has a starting code point, an ending code point, and a value that
 110 # is meant to apply to all the code points between the two end points,
 111 # inclusive.  For a map table, this value is the property value for those
 112 # code points.  Two such ranges could be written like this:
 113 #   0x41 .. 0x5A, 'Upper',
 114 #   0x61 .. 0x7A, 'Lower'
 115 #
 116 # Each range also has a type used as a convenience to classify the values.
 117 # Most ranges in this program will be Type 0, or normal, but there are some
 118 # ranges that have a non-zero type.  These are used only in map tables, and
 119 # are for mappings that don't fit into the normal scheme of things.  Mappings
 120 # that require a hash entry to communicate with utf8.c are one example;
 121 # another example is mappings for charnames.pm to use which indicate a name
 122 # that is algorithmically determinable from its code point (and the reverse).
 123 # These are used to significantly compact these tables, instead of listing
 124 # each one of the tens of thousands individually.
 125 #
 126 # In a match table, the value of a range is irrelevant (and hence the type as
 127 # well, which will always be 0), and arbitrarily set to the null string.
 128 # Using the example above, there would be two match tables for those two
 129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 130 # named Lower would contain 0x61..0x7A.
 131 #
 132 # Actually, there are two types of range lists, "Range_Map" is the one
 133 # associated with map tables, and "Range_List" with match tables.
 134 # Again, this is so that methods can be defined on one and not the others so
 135 # as to prevent operating on them in incorrect ways.
 136 #
 137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 138 # in the perl core.  All tables could in theory be written, but some are
 139 # suppressed because there is no current practical use for them.  It is easy
 140 # to change which get written by changing various lists that are near the top
 141 # of the actual code in this file.  The table data structures contain enough
 142 # ancillary information to allow them to be treated as separate entities for
 143 # writing, such as the path to each one's file.  There is a heading in each
 144 # map table that gives the format of its entries, and what the map is for all
 145 # the code points missing from it.  (This allows tables to be more compact.)
 146 #
 147 # The Property data structure contains one or more tables.  All properties
 148 # contain a map table (except the $perl property which is a
 149 # pseudo-property containing only match tables), and any properties that
 150 # are usable in regular expression matches also contain various matching
 151 # tables, one for each value the property can have.  A binary property can
 152 # have two values, True and False (or Y and N, which are preferred by Unicode
 153 # terminology).  Thus each of these properties will have a map table that
 154 # takes every code point and maps it to Y or N (but having ranges cuts the
 155 # number of entries in that table way down), and two match tables, one
 156 # which has a list of all the code points that map to Y, and one for all the
 157 # code points that map to N.  (For each binary property, a third table is also
 158 # generated for the pseudo Perl property.  It contains the identical code
 159 # points as the Y table, but can be written in regular expressions, not in the
 160 # compound form, but in a "single" form like \p{IsUppercase}.)  Many
 161 # properties are binary, but some properties have several possible values,
 162 # some have many, and properties like Name have a different value for every
 163 # named code point.  Those will not, unless the controlling lists are changed,
 164 # have their match tables written out.  But all the ones which can be used in
 165 # regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
 166 # a property would have either its map table or its match tables written but
 167 # not both.  Again, what gets written is controlled by lists which can easily
 168 # be changed.  Starting in 5.14, advantage was taken of this, and all the map
 169 # tables needed to reconstruct the Unicode db are now written out, while
 170 # suppressing the Unicode .txt files that contain the data.  Our tables are
 171 # much more compact than the .txt files, so a significant space savings was
 172 # achieved.  Also, tables are not written out that are trivially derivable
 173 # from tables that do get written.  So, there typically is no file containing
 174 # the code points not matched by a binary property (the table for \P{} versus
 175 # lowercase \p{}), since you just need to invert the True table to get the
 176 # False table.
 177
 178 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
 179 # how many match tables there are and the content of the maps.  This 'Type' is
 180 # different than a range 'Type', so don't get confused by the two concepts
 181 # having the same name.
 182 #
 183 # For information about the Unicode properties, see Unicode's UAX44 document:
 184
 185 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 186
 187 # As stated earlier, this program will work on any release of Unicode so far.
 188 # Most obvious problems in earlier data have NOT been corrected except when
 189 # necessary to make Perl or this program work reasonably, and to keep out
 190 # potential security issues.  For example, no folding information was given in
 191 # early releases, so this program substitutes lower case instead, just so that
 192 # a regular expression with the /i option will do something that actually
 193 # gives the right results in many cases.  There are also a couple other
 194 # corrections for version 1.1.5, commented at the point they are made.  As an
 195 # example of corrections that weren't made (but could be) is this statement
 196 # from DerivedAge.txt: "The supplementary private use code points and the
 197 # non-character code points were assigned in version 2.0, but not specifically
 198 # listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
 199 # it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
 200 # further down in these introductory comments.
 201 #
 202 # This program works on all non-provisional properties as of the current
 203 # Unicode release, though the files for some are suppressed for various
 204 # reasons.  You can change which are output by changing lists in this program.
 205 #
 206 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 207 # loose matchings rules (from Unicode TR18):
 208 #
 209 #    The recommended names for UCD properties and property values are in
 210 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 211 #    [PropValue]. There are both abbreviated names and longer, more
 212 #    descriptive names. It is strongly recommended that both names be
 213 #    recognized, and that loose matching of property names be used,
 214 #    whereby the case distinctions, whitespace, hyphens, and underbar
 215 #    are ignored.
 216 #
 217 # The program still allows Fuzzy to override its determination of if loose
 218 # matching should be used, but it isn't currently used, as it is no longer
 219 # needed; the calculations it makes are good enough.
 220 #
 221 # SUMMARY OF HOW IT WORKS:
 222 #
 223 #   Process arguments
 224 #
 225 #   A list is constructed containing each input file that is to be processed
 226 #
 227 #   Each file on the list is processed in a loop, using the associated handler
 228 #   code for each:
 229 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 230 #            first.  These files name the properties and property values.
 231 #            Objects are created of all the property and property value names
 232 #            that the rest of the input should expect, including all synonyms.
 233 #        The other input files give mappings from properties to property
 234 #           values.  That is, they list code points and say what the mapping
 235 #           is under the given property.  Some files give the mappings for
 236 #           just one property; and some for many.  This program goes through
 237 #           each file and populates the properties and their map tables from
 238 #           them.  Some properties are listed in more than one file, and
 239 #           Unicode has set up a precedence as to which has priority if there
 240 #           is a conflict.  Thus the order of processing matters, and this
 241 #           program handles the conflict possibility by processing the
 242 #           overriding input files last, so that if necessary they replace
 243 #           earlier values.
 244 #        After this is all done, the program creates the property mappings not
 245 #            furnished by Unicode, but derivable from what it does give.
 246 #        The tables of code points that match each property value in each
 247 #            property that is accessible by regular expressions are created.
 248 #        The Perl-defined properties are created and populated.  Many of these
 249 #            require data determined from the earlier steps
 250 #        Any Perl-defined synonyms are created, and name clashes between Perl
 251 #            and Unicode are reconciled and warned about.
 252 #        All the properties are written to files
 253 #        Any other files are written, and final warnings issued.
 254 #
 255 # For clarity, a number of operators have been overloaded to work on tables:
 256 #   ~ means invert (take all characters not in the set).  The more
 257 #       conventional '!' is not used because of the possibility of confusing
 258 #       it with the actual boolean operation.
 259 #   + means union
 260 #   - means subtraction
 261 #   & means intersection
 262 # The precedence of these is the order listed.  Parentheses should be
 263 # copiously used.  These are not a general scheme.  The operations aren't
 264 # defined for a number of things, deliberately, to avoid getting into trouble.
 265 # Operations are done on references and affect the underlying structures, so
 266 # that the copy constructors for them have been overloaded to not return a new
 267 # clone, but the input object itself.
 268 #
 269 # The bool operator is deliberately not overloaded to avoid confusion with
 270 # "should it mean if the object merely exists, or also is non-empty?".
 271 #
 272 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 273 #
 274 # This program needs to be able to run under miniperl.  Therefore, it uses a
 275 # minimum of other modules, and hence implements some things itself that could
 276 # be gotten from CPAN
 277 #
 278 # This program uses inputs published by the Unicode Consortium.  These can
 279 # change incompatibly between releases without the Perl maintainers realizing
 280 # it.  Therefore this program is now designed to try to flag these.  It looks
 281 # at the directories where the inputs are, and flags any unrecognized files.
 282 # It keeps track of all the properties in the files it handles, and flags any
 283 # that it doesn't know how to handle.  It also flags any input lines that
 284 # don't match the expected syntax, among other checks.
 285 #
 286 # It is also designed so if a new input file matches one of the known
 287 # templates, one hopefully just needs to add it to a list to have it
 288 # processed.
 289 #
 290 # As mentioned earlier, some properties are given in more than one file.  In
 291 # particular, the files in the extracted directory are supposedly just
 292 # reformattings of the others.  But they contain information not easily
 293 # derivable from the other files, including results for Unihan, which this
 294 # program doesn't ordinarily look at, and for unassigned code points.  They
 295 # also have historically had errors or been incomplete.  In an attempt to
 296 # create the best possible data, this program thus processes them first to
 297 # glean information missing from the other files; then processes those other
 298 # files to override any errors in the extracted ones.  Much of the design was
 299 # driven by this need to store things and then possibly override them.
 300 #
 301 # It tries to keep fatal errors to a minimum, to generate something usable for
 302 # testing purposes.  It always looks for files that could be inputs, and will
 303 # warn about any that it doesn't know how to handle (the -q option suppresses
 304 # the warning).
 305 #
 306 # Why is there more than one type of range?
 307 #   This simplified things.  There are some very specialized code points that
 308 #   have to be handled specially for output, such as Hangul syllable names.
 309 #   By creating a range type (done late in the development process), it
 310 #   allowed this to be stored with the range, and overridden by other input.
 311 #   Originally these were stored in another data structure, and it became a
 312 #   mess trying to decide if a second file that was for the same property was
 313 #   overriding the earlier one or not.
 314 #
 315 # Why are there two kinds of tables, match and map?
 316 #   (And there is a base class shared by the two as well.)  As stated above,
 317 #   they actually are for different things.  Development proceeded much more
 318 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 319 #   give the property value for every code point (actually every code point
 320 #   that doesn't map to a default value).  Match tables are used for regular
 321 #   expression matches, and are essentially the inverse mapping.  Separating
 322 #   the two allows more specialized methods, and error checks so that one
 323 #   can't just take the intersection of two map tables, for example, as that
 324 #   is nonsensical.
 325 #
 326 # What about 'fate' and 'status'.  The concept of a table's fate was created
 327 #   late when it became clear that something more was needed.  The difference
 328 #   between this and 'status' is unclean, and could be improved if someone
 329 #   wanted to spend the effort.
 330 #
 331 # DEBUGGING
 332 #
 333 # This program is written so it will run under miniperl.  Occasionally changes
 334 # will cause an error where the backtrace doesn't work well under miniperl.
 335 # To diagnose the problem, you can instead run it under regular perl, if you
 336 # have one compiled.
 337 #
 338 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 339 # to return true.  Then a line like
 340 #
 341 # local $to_trace = 1 if main::DEBUG;
 342 #
 343 # can be added to enable tracing in its lexical scope (plus dynamic) or until
 344 # you insert another line:
 345 #
 346 # local $to_trace = 0 if main::DEBUG;
 347 #
 348 # To actually trace, use a line like "trace $a, @b, %c, ...;
 349 #
 350 # Some of the more complex subroutines already have trace statements in them.
 351 # Permanent trace statements should be like:
 352 #
 353 # trace ... if main::DEBUG && $to_trace;
 354 #
 355 # If there is just one or a few files that you're debugging, you can easily
 356 # cause most everything else to be skipped.  Change the line
 357 #
 358 # my $debug_skip = 0;
 359 #
 360 # to 1, and every file whose object is in @input_file_objects and doesn't have
 361 # a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
 362 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
 363 #
 364 # To compare the output tables, it may be useful to specify the -annotate
 365 # flag.  This causes the tables to expand so there is one entry for each
 366 # non-algorithmically named code point giving, currently its name, and its
 367 # graphic representation if printable (and you have a font that knows about
 368 # it).  This makes it easier to see what the particular code points are in
 369 # each output table.  The tables are usable, but because they don't have
 370 # ranges (for the most part), a Perl using them will run slower.  Non-named
 371 # code points are annotated with a description of their status, and contiguous
 372 # ones with the same description will be output as a range rather than
 373 # individually.  Algorithmically named characters are also output as ranges,
 374 # except when there are just a few contiguous ones.
 375 #
 376 # FUTURE ISSUES
 377 #
 378 # The program would break if Unicode were to change its names so that
 379 # interior white space, underscores, or dashes differences were significant
 380 # within property and property value names.
 381 #
 382 # It might be easier to use the xml versions of the UCD if this program ever
 383 # would need heavy revision, and the ability to handle old versions was not
 384 # required.
 385 #
 386 # There is the potential for name collisions, in that Perl has chosen names
 387 # that Unicode could decide it also likes.  There have been such collisions in
 388 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 389 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 390 # collisions, which were withdrawn before the final release, because of Perl's
 391 # and other's protests.  These all involved new properties which began with
 392 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 393 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 394 # Unicode document, so they are unlikely to be used by Unicode for another
 395 # purpose.  However, they might try something beginning with 'In', or use any
 396 # of the other Perl-defined properties.  This program will warn you of name
 397 # collisions, and refuse to generate tables with them, but manual intervention
 398 # will be required in this event.  One scheme that could be implemented, if
 399 # necessary, would be to have this program generate another file, or add a
 400 # field to mktables.lst that gives the date of first definition of a property.
 401 # Each new release of Unicode would use that file as a basis for the next
 402 # iteration.  And the Perl synonym addition code could sort based on the age
 403 # of the property, so older properties get priority, and newer ones that clash
 404 # would be refused; hence existing code would not be impacted, and some other
 405 # synonym would have to be used for the new property.  This is ugly, and
 406 # manual intervention would certainly be easier to do in the short run; lets
 407 # hope it never comes to this.
 408 #
 409 # A NOTE ON UNIHAN
 410 #
 411 # This program can generate tables from the Unihan database.  But it doesn't
 412 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 413 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 414 # database was split into 8 different files, all beginning with the letters
 415 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 416 # know which of the many properties in the file(s) should have tables created
 417 # for them.  It will create tables for any properties listed in
 418 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 419 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 420 # property you want is not in those files of the release you are building
 421 # against, you must add it to those two arrays.  Starting in 4.0, the
 422 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 423 # is present in the directory, a table will be generated for that property.
 424 # In 5.2, several more properties were added.  For your convenience, the two
 425 # arrays are initialized with all the 6.0 listed properties that are also in
 426 # earlier releases.  But these are commented out.  You can just uncomment the
 427 # ones you want, or use them as a template for adding entries for other
 428 # properties.
 429 #
 430 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 431 # and filter_unihan_line() are the functions where this is done.  This program
 432 # already does some adjusting to make the lines look more like the rest of the
 433 # Unicode DB;  You can see what that is in filter_unihan_line()
 434 #
 435 # There is a bug in the 3.2 data file in which some values for the
 436 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 437 # could be added for these; or for a particular installation, the Unihan.txt
 438 # file could be edited to fix them.
 439 #
 440 # HOW TO ADD A FILE TO BE PROCESSED
 441 #
 442 # A new file from Unicode needs to have an object constructed for it in
 443 # @input_file_objects, probably at the end or at the end of the extracted
 444 # ones.  The program should warn you if its name will clash with others on
 445 # restrictive file systems, like DOS.  If so, figure out a better name, and
 446 # add lines to the README.perl file giving that.  If the file is a character
 447 # property, it should be in the format that Unicode has implicitly
 448 # standardized for such files for the more recently introduced ones.
 449 # If so, the Input_file constructor for @input_file_objects can just be the
 450 # file name and release it first appeared in.  If not, then it should be
 451 # possible to construct an each_line_handler() to massage the line into the
 452 # standardized form.
 453 #
 454 # For non-character properties, more code will be needed.  You can look at
 455 # the existing entries for clues.
 456 #
 457 # UNICODE VERSIONS NOTES
 458 #
 459 # The Unicode UCD has had a number of errors in it over the versions.  And
 460 # these remain, by policy, in the standard for that version.  Therefore it is
 461 # risky to correct them, because code may be expecting the error.  So this
 462 # program doesn't generally make changes, unless the error breaks the Perl
 463 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 464 # for U+1105, which causes real problems for the algorithms for Jamo
 465 # calculations, so it is changed here.
 466 #
 467 # But it isn't so clear cut as to what to do about concepts that are
 468 # introduced in a later release; should they extend back to earlier releases
 469 # where the concept just didn't exist?  It was easier to do this than to not,
 470 # so that's what was done.  For example, the default value for code points not
 471 # in the files for various properties was probably undefined until changed by
 472 # some version.  No_Block for blocks is such an example.  This program will
 473 # assign No_Block even in Unicode versions that didn't have it.  This has the
 474 # benefit that code being written doesn't have to special case earlier
 475 # versions; and the detriment that it doesn't match the Standard precisely for
 476 # the affected versions.
 477 #
 478 # Here are some observations about some of the issues in early versions:
 479 #
 480 # Prior to version 3.0, there were 3 character decompositions.  These are not
 481 # handled by Unicode::Normalize, nor will it compile when presented a version
 482 # that has them.  However, you can trivially get it to compile by simply
 483 # ignoring those decompositions, by changing the croak to a carp.  At the time
 484 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
 485 #
 486 #   croak("Weird Canonical Decomposition of U+$h");
 487 #
 488 # Simply change to a carp.  It will compile, but will not know about any three
 489 # character decomposition.
 490
 491 # The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
 492 # that the reason is that the CJK block starting at 4E00 was removed from
 493 # PropList, and was not put back in until 3.1.0.  The Perl extension (the
 494 # single property name \p{alpha}) has the correct values.  But the compound
 495 # form is simply not generated until 3.1, as it can be argued that prior to
 496 # this release, this was not an official property.  The comments for
 497 # filter_old_style_proplist() give more details.
 498 #
 499 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 500 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 501 # reason is that 3.2 introduced U+205F=medium math space, which was not
 502 # classed as white space, but Perl figured out that it should have been. 4.0
 503 # reclassified it correctly.
 504 #
 505 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 506 # this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
 507 # became 202, and ATBL was left with no code points, as all the ones that
 508 # mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
 509 # name for the class, it would not have been affected, but if it used the
 510 # mnemonic, it would have been.
 511 #
 512 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 513 # points which eventually came to have this script property value, instead
 514 # mapped to "Unknown".  But in the next release all these code points were
 515 # moved to \p{sc=common} instead.
 516 #
 517 # The default for missing code points for BidiClass is complicated.  Starting
 518 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 519 # tries to do the best it can for earlier releases.  It is done in
 520 # process_PropertyAliases()
 521 #
 522 # In version 2.1.2, the entry in UnicodeData.txt:
 523 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
 524 # should instead be
 525 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
 526 # Without this change, there are casing problems for this character.
 527 #
 528 ##############################################################################
 529
 530 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 531                         # and errors
 532 my $MAX_LINE_WIDTH = 78;
 533
 534 # Debugging aid to skip most files so as to not be distracted by them when
 535 # concentrating on the ones being debugged.  Add
 536 # non_skip => 1,
 537 # to the constructor for those files you want processed when you set this.
 538 # Files with a first version number of 0 are special: they are always
 539 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 540 # UnicodeData.txt must not be skipped if you want this program to not die
 541 # before normal completion.
 542 my $debug_skip = 0;
 543
 544
 545 # Normally these are suppressed.
 546 my $write_Unicode_deprecated_tables = 0;
 547
 548 # Set to 1 to enable tracing.
 549 our $to_trace = 0;
 550
 551 { # Closure for trace: debugging aid
 552     my $print_caller = 1;        # ? Include calling subroutine name
 553     my $main_with_colon = 'main::';
 554     my $main_colon_length = length($main_with_colon);
 555
 556     sub trace {
 557         return unless $to_trace;        # Do nothing if global flag not set
 558
 559         my @input = @_;
 560
 561         local $DB::trace = 0;
 562         $DB::trace = 0;          # Quiet 'used only once' message
 563
 564         my $line_number;
 565
 566         # Loop looking up the stack to get the first non-trace caller
 567         my $caller_line;
 568         my $caller_name;
 569         my $i = 0;
 570         do {
 571             $line_number = $caller_line;
 572             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 573             $caller = $main_with_colon unless defined $caller;
 574
 575             $caller_name = $caller;
 576
 577             # get rid of pkg
 578             $caller_name =~ s/.*:://;
 579             if (substr($caller_name, 0, $main_colon_length)
 580                 eq $main_with_colon)
 581             {
 582                 $caller_name = substr($caller_name, $main_colon_length);
 583             }
 584
 585         } until ($caller_name ne 'trace');
 586
 587         # If the stack was empty, we were called from the top level
 588         $caller_name = 'main' if ($caller_name eq ""
 589                                     || $caller_name eq 'trace');
 590
 591         my $output = "";
 592         foreach my $string (@input) {
 593             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 594             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 595                 $output .= simple_dumper($string);
 596             }
 597             else {
 598                 $string = "$string" if ref $string;
 599                 $string = $UNDEF unless defined $string;
 600                 chomp $string;
 601                 $string = '""' if $string eq "";
 602                 $output .= " " if $output ne ""
 603                                 && $string ne ""
 604                                 && substr($output, -1, 1) ne " "
 605                                 && substr($string, 0, 1) ne " ";
 606                 $output .= $string;
 607             }
 608         }
 609
 610         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 611         print STDERR "$caller_name: " if $print_caller;
 612         print STDERR $output, "\n";
 613         return;
 614     }
 615 }
 616
 617 # This is for a rarely used development feature that allows you to compare two
 618 # versions of the Unicode standard without having to deal with changes caused
 619 # by the code points introduced in the later version.  Change the 0 to a
 620 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 621 # code points introduced in that release and earlier will be used; later ones
 622 # are thrown away.  You use the version number of the earliest one you want to
 623 # compare; then run this program on directory structures containing each
 624 # release, and compare the outputs.  These outputs will therefore include only
 625 # the code points common to both releases, and you can see the changes caused
 626 # just by the underlying release semantic changes.  For versions earlier than
 627 # 3.2, you must copy a version of DAge.txt into the directory.
 628 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 629 my $compare_versions = DEBUG
 630                        && $string_compare_versions
 631                        && pack "C*", split /\./, $string_compare_versions;
 632
 633 sub uniques {
 634     # Returns non-duplicated input values.  From "Perl Best Practices:
 635     # Encapsulated Cleverness".  p. 455 in first edition.
 636
 637     my %seen;
 638     # Arguably this breaks encapsulation, if the goal is to permit multiple
 639     # distinct objects to stringify to the same value, and be interchangeable.
 640     # However, for this program, no two objects stringify identically, and all
 641     # lists passed to this function are either objects or strings. So this
 642     # doesn't affect correctness, but it does give a couple of percent speedup.
 643     no overloading;
 644     return grep { ! $seen{$_}++ } @_;
 645 }
 646
 647 $0 = File::Spec->canonpath($0);
 648
 649 my $make_test_script = 0;      # ? Should we output a test script
 650 my $make_norm_test_script = 0; # ? Should we output a normalization test script
 651 my $write_unchanged_files = 0; # ? Should we update the output files even if
 652                                #    we don't think they have changed
 653 my $use_directory = "";        # ? Should we chdir somewhere.
 654 my $pod_directory;             # input directory to store the pod file.
 655 my $pod_file = 'perluniprops';
 656 my $t_path;                     # Path to the .t test file
 657 my $file_list = 'mktables.lst'; # File to store input and output file names.
 658                                # This is used to speed up the build, by not
 659                                # executing the main body of the program if
 660                                # nothing on the list has changed since the
 661                                # previous build
 662 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 663                                # make a list so that when the pumpking is
 664                                # preparing a release, s/he won't have to do
 665                                # special things
 666 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 667                                # in the input.
 668 my $output_range_counts = $debugging_build;   # ? Should we include the number
 669                                               # of code points in ranges in
 670                                               # the output
 671 my $annotate = 0;              # ? Should character names be in the output
 672
 673 # Verbosity levels; 0 is quiet
 674 my $NORMAL_VERBOSITY = 1;
 675 my $PROGRESS = 2;
 676 my $VERBOSE = 3;
 677
 678 my $verbosity = $NORMAL_VERBOSITY;
 679
 680 # Process arguments
 681 while (@ARGV) {
 682     my $arg = shift @ARGV;
 683     if ($arg eq '-v') {
 684         $verbosity = $VERBOSE;
 685     }
 686     elsif ($arg eq '-p') {
 687         $verbosity = $PROGRESS;
 688         $| = 1;     # Flush buffers as we go.
 689     }
 690     elsif ($arg eq '-q') {
 691         $verbosity = 0;
 692     }
 693     elsif ($arg eq '-w') {
 694         $write_unchanged_files = 1; # update the files even if havent changed
 695     }
 696     elsif ($arg eq '-check') {
 697         my $this = shift @ARGV;
 698         my $ok = shift @ARGV;
 699         if ($this ne $ok) {
 700             print "Skipping as check params are not the same.\n";
 701             exit(0);
 702         }
 703     }
 704     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 705         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 706     }
 707     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 708     {
 709         $make_test_script = 1;
 710     }
 711     elsif ($arg eq '-makenormtest')
 712     {
 713         $make_norm_test_script = 1;
 714     }
 715     elsif ($arg eq '-makelist') {
 716         $make_list = 1;
 717     }
 718     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 719         -d $use_directory or croak "Unknown directory '$use_directory'";
 720     }
 721     elsif ($arg eq '-L') {
 722
 723         # Existence not tested until have chdir'd
 724         $file_list = shift;
 725     }
 726     elsif ($arg eq '-globlist') {
 727         $glob_list = 1;
 728     }
 729     elsif ($arg eq '-c') {
 730         $output_range_counts = ! $output_range_counts
 731     }
 732     elsif ($arg eq '-annotate') {
 733         $annotate = 1;
 734         $debugging_build = 1;
 735         $output_range_counts = 1;
 736     }
 737     else {
 738         my $with_c = 'with';
 739         $with_c .= 'out' if $output_range_counts;   # Complements the state
 740         croak <<END;
 741 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 742           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 743           [-check A B ]
 744   -c          : Output comments $with_c number of code points in ranges
 745   -q          : Quiet Mode: Only output serious warnings.
 746   -p          : Set verbosity level to normal plus show progress.
 747   -v          : Set Verbosity level high:  Show progress and non-serious
 748                 warnings
 749   -w          : Write files regardless
 750   -C dir      : Change to this directory before proceeding. All relative paths
 751                 except those specified by the -P and -T options will be done
 752                 with respect to this directory.
 753   -P dir      : Output $pod_file file to directory 'dir'.
 754   -T path     : Create a test script as 'path'; overrides -maketest
 755   -L filelist : Use alternate 'filelist' instead of standard one
 756   -globlist   : Take as input all non-Test *.txt files in current and sub
 757                 directories
 758   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 759                 overrides -T
 760   -makelist   : Rewrite the file list $file_list based on current setup
 761   -annotate   : Output an annotation for each character in the table files;
 762                 useful for debugging mktables, looking at diffs; but is slow,
 763                 memory intensive; resulting tables are usable but are slow and
 764                 very large (and currently fail the Unicode::UCD.t tests).
 765   -check A B  : Executes $0 only if A and B are the same
 766 END
 767     }
 768 }
 769
 770 # Stores the most-recently changed file.  If none have changed, can skip the
 771 # build
 772 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 773
 774 # Change directories now, because need to read 'version' early.
 775 if ($use_directory) {
 776     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 777         $pod_directory = File::Spec->rel2abs($pod_directory);
 778     }
 779     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 780         $t_path = File::Spec->rel2abs($t_path);
 781     }
 782     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 783     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 784         $pod_directory = File::Spec->abs2rel($pod_directory);
 785     }
 786     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 787         $t_path = File::Spec->abs2rel($t_path);
 788     }
 789 }
 790
 791 # Get Unicode version into regular and v-string.  This is done now because
 792 # various tables below get populated based on it.  These tables are populated
 793 # here to be near the top of the file, and so easily seeable by those needing
 794 # to modify things.
 795 open my $VERSION, "<", "version"
 796                     or croak "$0: can't open required file 'version': $!\n";
 797 my $string_version = <$VERSION>;
 798 close $VERSION;
 799 chomp $string_version;
 800 my $v_version = pack "C*", split /\./, $string_version;        # v string
 801
 802 # The following are the complete names of properties with property values that
 803 # are known to not match any code points in some versions of Unicode, but that
 804 # may change in the future so they should be matchable, hence an empty file is
 805 # generated for them.
 806 my @tables_that_may_be_empty = (
 807                                 'Joining_Type=Left_Joining',
 808                                 );
 809 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 810 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 811 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 812                                                     if $v_version ge v4.1.0;
 813 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 814                                                     if $v_version ge v6.0.0;
 815 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 816                                                     if $v_version ge v6.1.0;
 817
 818 # The lists below are hashes, so the key is the item in the list, and the
 819 # value is the reason why it is in the list.  This makes generation of
 820 # documentation easier.
 821
 822 my %why_suppressed;  # No file generated for these.
 823
 824 # Files aren't generated for empty extraneous properties.  This is arguable.
 825 # Extraneous properties generally come about because a property is no longer
 826 # used in a newer version of Unicode.  If we generated a file without code
 827 # points, programs that used to work on that property will still execute
 828 # without errors.  It just won't ever match (or will always match, with \P{}).
 829 # This means that the logic is now likely wrong.  I (khw) think its better to
 830 # find this out by getting an error message.  Just move them to the table
 831 # above to change this behavior
 832 my %why_suppress_if_empty_warn_if_not = (
 833
 834    # It is the only property that has ever officially been removed from the
 835    # Standard.  The database never contained any code points for it.
 836    'Special_Case_Condition' => 'Obsolete',
 837
 838    # Apparently never official, but there were code points in some versions of
 839    # old-style PropList.txt
 840    'Non_Break' => 'Obsolete',
 841 );
 842
 843 # These would normally go in the warn table just above, but they were changed
 844 # a long time before this program was written, so warnings about them are
 845 # moot.
 846 if ($v_version gt v3.2.0) {
 847     push @tables_that_may_be_empty,
 848                                 'Canonical_Combining_Class=Attached_Below_Left'
 849 }
 850
 851 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 852 # unless explicitly added.
 853 if ($v_version ge v5.2.0) {
 854     my $unihan = 'Unihan; remove from list if using Unihan';
 855     foreach my $table (qw (
 856                            kAccountingNumeric
 857                            kOtherNumeric
 858                            kPrimaryNumeric
 859                            kCompatibilityVariant
 860                            kIICore
 861                            kIRG_GSource
 862                            kIRG_HSource
 863                            kIRG_JSource
 864                            kIRG_KPSource
 865                            kIRG_MSource
 866                            kIRG_KSource
 867                            kIRG_TSource
 868                            kIRG_USource
 869                            kIRG_VSource
 870                            kRSUnicode
 871                         ))
 872     {
 873         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 874     }
 875 }
 876
 877 # Enum values for to_output_map() method in the Map_Table package.
 878 my $EXTERNAL_MAP = 1;
 879 my $INTERNAL_MAP = 2;
 880 my $OUTPUT_ADJUSTED = 3;
 881
 882 # To override computed values for writing the map tables for these properties.
 883 # The default for enum map tables is to write them out, so that the Unicode
 884 # .txt files can be removed, but all the data to compute any property value
 885 # for any code point is available in a more compact form.
 886 my %global_to_output_map = (
 887     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 888     # get stuck supporting it if things change.  Since it is a STRING
 889     # property, it normally would be listed in the pod, but INTERNAL_MAP
 890     # suppresses that.
 891     Unicode_1_Name => $INTERNAL_MAP,
 892
 893     Present_In => 0,                # Suppress, as easily computed from Age
 894     Block => 0,                     # Suppress, as Blocks.txt is retained.
 895
 896     # Suppress, as mapping can be found instead from the
 897     # Perl_Decomposition_Mapping file
 898     Decomposition_Type => 0,
 899 );
 900
 901 # Properties that this program ignores.
 902 my @unimplemented_properties;
 903
 904 # With this release, it is automatically handled if the Unihan db is
 905 # downloaded
 906 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
 907
 908 # There are several types of obsolete properties defined by Unicode.  These
 909 # must be hand-edited for every new Unicode release.
 910 my %why_deprecated;  # Generates a deprecated warning message if used.
 911 my %why_stabilized;  # Documentation only
 912 my %why_obsolete;    # Documentation only
 913
 914 {   # Closure
 915     my $simple = 'Perl uses the more complete version of this property';
 916     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 917
 918     my $other_properties = 'other properties';
 919     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 920     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 921
 922     %why_deprecated = (
 923         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 924         'Jamo_Short_Name' => $contributory,
 925         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 926         'Other_Alphabetic' => $contributory,
 927         'Other_Default_Ignorable_Code_Point' => $contributory,
 928         'Other_Grapheme_Extend' => $contributory,
 929         'Other_ID_Continue' => $contributory,
 930         'Other_ID_Start' => $contributory,
 931         'Other_Lowercase' => $contributory,
 932         'Other_Math' => $contributory,
 933         'Other_Uppercase' => $contributory,
 934         'Expands_On_NFC' => $why_no_expand,
 935         'Expands_On_NFD' => $why_no_expand,
 936         'Expands_On_NFKC' => $why_no_expand,
 937         'Expands_On_NFKD' => $why_no_expand,
 938     );
 939
 940     %why_suppressed = (
 941         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 942         # contains the same information, but without the algorithmically
 943         # determinable Hangul syllables'.  This file is not published, so it's
 944         # existence is not noted in the comment.
 945         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
 946
 947         'Indic_Matra_Category' => "Provisional",
 948         'Indic_Syllabic_Category' => "Provisional",
 949
 950         # Don't suppress ISO_Comment, as otherwise special handling is needed
 951         # to differentiate between it and gc=c, which can be written as 'isc',
 952         # which is the same characters as ISO_Comment's short name.
 953
 954         'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
 955
 956         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
 957         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 958         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 959         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 960
 961         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 962     );
 963
 964     foreach my $property (
 965
 966             # The following are suppressed because they were made contributory
 967             # or deprecated by Unicode before Perl ever thought about
 968             # supporting them.
 969             'Jamo_Short_Name',
 970             'Grapheme_Link',
 971             'Expands_On_NFC',
 972             'Expands_On_NFD',
 973             'Expands_On_NFKC',
 974             'Expands_On_NFKD',
 975
 976             # The following are suppressed because they have been marked
 977             # as deprecated for a sufficient amount of time
 978             'Other_Alphabetic',
 979             'Other_Default_Ignorable_Code_Point',
 980             'Other_Grapheme_Extend',
 981             'Other_ID_Continue',
 982             'Other_ID_Start',
 983             'Other_Lowercase',
 984             'Other_Math',
 985             'Other_Uppercase',
 986     ) {
 987         $why_suppressed{$property} = $why_deprecated{$property};
 988     }
 989
 990     # Customize the message for all the 'Other_' properties
 991     foreach my $property (keys %why_deprecated) {
 992         next if (my $main_property = $property) !~ s/^Other_//;
 993         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 994     }
 995 }
 996
 997 if ($write_Unicode_deprecated_tables) {
 998     foreach my $property (keys %why_suppressed) {
 999         delete $why_suppressed{$property} if $property =~
1000                                                     / ^ Other | Grapheme /x;
1001     }
1002 }
1003
1004 if ($v_version ge 4.0.0) {
1005     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1006     if ($v_version ge 6.0.0) {
1007         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1008     }
1009 }
1010 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1011     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1012     if ($v_version ge 6.0.0) {
1013         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1014     }
1015 }
1016
1017 # Probably obsolete forever
1018 if ($v_version ge v4.1.0) {
1019     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1020 }
1021 if ($v_version ge v6.0.0) {
1022     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
1023     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1024 }
1025
1026 # This program can create files for enumerated-like properties, such as
1027 # 'Numeric_Type'.  This file would be the same format as for a string
1028 # property, with a mapping from code point to its value, so you could look up,
1029 # for example, the script a code point is in.  But no one so far wants this
1030 # mapping, or they have found another way to get it since this is a new
1031 # feature.  So no file is generated except if it is in this list.
1032 my @output_mapped_properties = split "\n", <<END;
1033 END
1034
1035 # If you are using the Unihan database in a Unicode version before 5.2, you
1036 # need to add the properties that you want to extract from it to this table.
1037 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1038 # listed, commented out
1039 my @cjk_properties = split "\n", <<'END';
1040 #cjkAccountingNumeric; kAccountingNumeric
1041 #cjkOtherNumeric; kOtherNumeric
1042 #cjkPrimaryNumeric; kPrimaryNumeric
1043 #cjkCompatibilityVariant; kCompatibilityVariant
1044 #cjkIICore ; kIICore
1045 #cjkIRG_GSource; kIRG_GSource
1046 #cjkIRG_HSource; kIRG_HSource
1047 #cjkIRG_JSource; kIRG_JSource
1048 #cjkIRG_KPSource; kIRG_KPSource
1049 #cjkIRG_KSource; kIRG_KSource
1050 #cjkIRG_TSource; kIRG_TSource
1051 #cjkIRG_USource; kIRG_USource
1052 #cjkIRG_VSource; kIRG_VSource
1053 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
1054 END
1055
1056 # Similarly for the property values.  For your convenience, the lines in the
1057 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
1058 # '#' marks (for Unicode versions before 5.2)
1059 my @cjk_property_values = split "\n", <<'END';
1060 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1061 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1062 ## @missing: 0000..10FFFF; cjkIICore; <none>
1063 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1064 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1065 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1066 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1067 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1068 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1069 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1070 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1071 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1072 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1073 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1074 END
1075
1076 # The input files don't list every code point.  Those not listed are to be
1077 # defaulted to some value.  Below are hard-coded what those values are for
1078 # non-binary properties as of 5.1.  Starting in 5.0, there are
1079 # machine-parsable comment lines in the files the give the defaults; so this
1080 # list shouldn't have to be extended.  The claim is that all missing entries
1081 # for binary properties will default to 'N'.  Unicode tried to change that in
1082 # 5.2, but the beta period produced enough protest that they backed off.
1083 #
1084 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1085 # be in the form that it expects.  The others may be synonyms.
1086 my $CODE_POINT = '<code point>';
1087 my %default_mapping = (
1088     Age => "Unassigned",
1089     # Bidi_Class => Complicated; set in code
1090     Bidi_Mirroring_Glyph => "",
1091     Block => 'No_Block',
1092     Canonical_Combining_Class => 0,
1093     Case_Folding => $CODE_POINT,
1094     Decomposition_Mapping => $CODE_POINT,
1095     Decomposition_Type => 'None',
1096     East_Asian_Width => "Neutral",
1097     FC_NFKC_Closure => $CODE_POINT,
1098     General_Category => 'Cn',
1099     Grapheme_Cluster_Break => 'Other',
1100     Hangul_Syllable_Type => 'NA',
1101     ISO_Comment => "",
1102     Jamo_Short_Name => "",
1103     Joining_Group => "No_Joining_Group",
1104     # Joining_Type => Complicated; set in code
1105     kIICore => 'N',   #                       Is converted to binary
1106     #Line_Break => Complicated; set in code
1107     Lowercase_Mapping => $CODE_POINT,
1108     Name => "",
1109     Name_Alias => "",
1110     NFC_QC => 'Yes',
1111     NFD_QC => 'Yes',
1112     NFKC_QC => 'Yes',
1113     NFKD_QC => 'Yes',
1114     Numeric_Type => 'None',
1115     Numeric_Value => 'NaN',
1116     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1117     Sentence_Break => 'Other',
1118     Simple_Case_Folding => $CODE_POINT,
1119     Simple_Lowercase_Mapping => $CODE_POINT,
1120     Simple_Titlecase_Mapping => $CODE_POINT,
1121     Simple_Uppercase_Mapping => $CODE_POINT,
1122     Titlecase_Mapping => $CODE_POINT,
1123     Unicode_1_Name => "",
1124     Unicode_Radical_Stroke => "",
1125     Uppercase_Mapping => $CODE_POINT,
1126     Word_Break => 'Other',
1127 );
1128
1129 # Below are files that Unicode furnishes, but this program ignores, and why.
1130 # NormalizationCorrections.txt requires some more explanation.  It documents
1131 # the cumulative fixes to erroneous normalizations in earlier Unicode
1132 # versions.  Its main purpose is so that someone running on an earlier version
1133 # can use this file to override what got published in that earlier release.
1134 # It would be easy for mktables to read and handle this file.  But all the
1135 # corrections in it should already be in the other files for the release it
1136 # is.  To get it to actually mean something useful, someone would have to be
1137 # using an earlier Unicode release, and copy it to the files for that release
1138 # and recomplile.  So far there has been no demand to do that, so this hasn't
1139 # been implemented.
1140 my %ignored_files = (
1141     'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1142     'Index.txt' => 'Alphabetical index of Unicode characters',
1143     'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1144     'NamesList.txt' => 'Annotated list of characters',
1145     'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1146     'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1147     'ReadMe.txt' => 'Documentation',
1148     'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized.  This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1149     'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1150     'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1151     'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1152     'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1153     'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1154 );
1155
1156 my %skipped_files;  # List of files that we skip
1157
1158 ### End of externally interesting definitions, except for @input_file_objects
1159
1160 my $HEADER=<<"EOF";
1161 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1162 # This file is machine-generated by $0 from the Unicode
1163 # database, Version $string_version.  Any changes made here will be lost!
1164 EOF
1165
1166 my $INTERNAL_ONLY_HEADER = <<"EOF";
1167
1168 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1169 # This file is for internal use by core Perl only.  The format and even the
1170 # name or existence of this file are subject to change without notice.  Don't
1171 # use it directly.
1172 EOF
1173
1174 my $DEVELOPMENT_ONLY=<<"EOF";
1175 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1176 # This file contains information artificially constrained to code points
1177 # present in Unicode release $string_compare_versions.
1178 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1179 # not be used for production.
1180
1181 EOF
1182
1183 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1184 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1185 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1186
1187 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1188 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1189 # to decrease backtracking.  The first regex allows the code point to be at
1190 # the end of a word, but to work properly, the word shouldn't end with a valid
1191 # hex character.  The second one won't match a code point at the end of a
1192 # word, and doesn't have the run-on issue
1193 my $run_on_code_point_re =
1194             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1195 my $code_point_re = qr/\b$run_on_code_point_re/;
1196
1197 # This matches the beginning of the line in the Unicode db files that give the
1198 # defaults for code points not listed (i.e., missing) in the file.  The code
1199 # depends on this ending with a semi-colon, so it can assume it is a valid
1200 # field when the line is split() by semi-colons
1201 my $missing_defaults_prefix =
1202             qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1203
1204 # Property types.  Unicode has more types, but these are sufficient for our
1205 # purposes.
1206 my $UNKNOWN = -1;   # initialized to illegal value
1207 my $NON_STRING = 1; # Either binary or enum
1208 my $BINARY = 2;
1209 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1210                        # tables, additional true and false tables are
1211                        # generated so that false is anything matching the
1212                        # default value, and true is everything else.
1213 my $ENUM = 4;       # Include catalog
1214 my $STRING = 5;     # Anything else: string or misc
1215
1216 # Some input files have lines that give default values for code points not
1217 # contained in the file.  Sometimes these should be ignored.
1218 my $NO_DEFAULTS = 0;        # Must evaluate to false
1219 my $NOT_IGNORED = 1;
1220 my $IGNORED = 2;
1221
1222 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1223 # and will appear in the main body of the tables in the output files, but
1224 # there are other types of ranges as well, listed below, that are specially
1225 # handled.   There are pseudo-types as well that will never be stored as a
1226 # type, but will affect the calculation of the type.
1227
1228 # 0 is for normal, non-specials
1229 my $MULTI_CP = 1;           # Sequence of more than code point
1230 my $HANGUL_SYLLABLE = 2;
1231 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1232 my $NULL = 4;               # The map is to the null string; utf8.c can't
1233                             # handle these, nor is there an accepted syntax
1234                             # for them in \p{} constructs
1235 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1236                              # otherwise be $MULTI_CP type are instead type 0
1237
1238 # process_generic_property_file() can accept certain overrides in its input.
1239 # Each of these must begin AND end with $CMD_DELIM.
1240 my $CMD_DELIM = "\a";
1241 my $REPLACE_CMD = 'replace';    # Override the Replace
1242 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1243
1244 my $NO = 0;
1245 my $YES = 1;
1246
1247 # Values for the Replace argument to add_range.
1248 # $NO                      # Don't replace; add only the code points not
1249                            # already present.
1250 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1251                            # the comments at the subroutine definition.
1252 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1253 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1254                            # already there
1255 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1256                            # already there
1257 my $CROAK = 6;             # Die with an error if is already there
1258
1259 # Flags to give property statuses.  The phrases are to remind maintainers that
1260 # if the flag is changed, the indefinite article referring to it in the
1261 # documentation may need to be as well.
1262 my $NORMAL = "";
1263 my $DEPRECATED = 'D';
1264 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1265 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1266 my $DISCOURAGED = 'X';
1267 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1268 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1269 my $STRICTER = 'T';
1270 my $a_bold_stricter = "a 'B<$STRICTER>'";
1271 my $A_bold_stricter = "A 'B<$STRICTER>'";
1272 my $STABILIZED = 'S';
1273 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1274 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1275 my $OBSOLETE = 'O';
1276 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1277 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1278
1279 my %status_past_participles = (
1280     $DISCOURAGED => 'discouraged',
1281     $STABILIZED => 'stabilized',
1282     $OBSOLETE => 'obsolete',
1283     $DEPRECATED => 'deprecated',
1284 );
1285
1286 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1287 # externally documented.
1288 my $ORDINARY = 0;       # The normal fate.
1289 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1290                         # but there is a file written that can be used to
1291                         # reconstruct this table
1292 my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1293                         # for Perl's internal use only
1294 my $SUPPRESSED = 3;     # The file for this table is not written out, and as a
1295                         # result, we don't bother to do many computations on
1296                         # it.
1297 my $PLACEHOLDER = 4;    # Like $SUPPRESSED, but we go through all the
1298                         # computations anyway, as the values are needed for
1299                         # things to work.  This happens when we have Perl
1300                         # extensions that depend on Unicode tables that
1301                         # wouldn't normally be in a given Unicode version.
1302
1303 # The format of the values of the tables:
1304 my $EMPTY_FORMAT = "";
1305 my $BINARY_FORMAT = 'b';
1306 my $DECIMAL_FORMAT = 'd';
1307 my $FLOAT_FORMAT = 'f';
1308 my $INTEGER_FORMAT = 'i';
1309 my $HEX_FORMAT = 'x';
1310 my $RATIONAL_FORMAT = 'r';
1311 my $STRING_FORMAT = 's';
1312 my $ADJUST_FORMAT = 'a';
1313 my $DECOMP_STRING_FORMAT = 'c';
1314 my $STRING_WHITE_SPACE_LIST = 'sw';
1315
1316 my %map_table_formats = (
1317     $BINARY_FORMAT => 'binary',
1318     $DECIMAL_FORMAT => 'single decimal digit',
1319     $FLOAT_FORMAT => 'floating point number',
1320     $INTEGER_FORMAT => 'integer',
1321     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1322     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1323     $STRING_FORMAT => 'string',
1324     $ADJUST_FORMAT => 'some entries need adjustment',
1325     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1326     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1327 );
1328
1329 # Unicode didn't put such derived files in a separate directory at first.
1330 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1331 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1332 my $AUXILIARY = 'auxiliary';
1333
1334 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1335 # and into UCD.pl for the use of UCD.pm
1336 my %loose_to_file_of;       # loosely maps table names to their respective
1337                             # files
1338 my %stricter_to_file_of;    # same; but for stricter mapping.
1339 my %loose_property_to_file_of; # Maps a loose property name to its map file
1340 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1341                             # in the hash %utf8::SwashInfo
1342 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1343                              # their rational equivalent
1344 my %loose_property_name_of; # Loosely maps (non_string) property names to
1345                             # standard form
1346 my %string_property_loose_to_name; # Same, for string properties.
1347 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1348                             # the property name in standard loose form, and
1349                             # 'value' is the default value for that property,
1350                             # also in standard loose form.
1351 my %loose_to_standard_value; # loosely maps table names to the canonical
1352                             # alias for them
1353 my %ambiguous_names;        # keys are alias names (in standard form) that
1354                             # have more than one possible meaning.
1355 my %prop_aliases;           # Keys are standard property name; values are each
1356                             # one's aliases
1357 my %prop_value_aliases;     # Keys of top level are standard property name;
1358                             # values are keys to another hash,  Each one is
1359                             # one of the property's values, in standard form.
1360                             # The values are that prop-val's aliases.
1361 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1362
1363 # Most properties are immune to caseless matching, otherwise you would get
1364 # nonsensical results, as properties are a function of a code point, not
1365 # everything that is caselessly equivalent to that code point.  For example,
1366 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1367 # be true because 's' and 'S' are equivalent caselessly.  However,
1368 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1369 # extend that concept to those very few properties that are like this.  Each
1370 # such property will match the full range caselessly.  They are hard-coded in
1371 # the program; it's not worth trying to make it general as it's extremely
1372 # unlikely that they will ever change.
1373 my %caseless_equivalent_to;
1374
1375 # These constants names and values were taken from the Unicode standard,
1376 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1377 # syllables.  The '_string' versions are so generated tables can retain the
1378 # hex format, which is the more familiar value
1379 my $SBase_string = "0xAC00";
1380 my $SBase = CORE::hex $SBase_string;
1381 my $LBase_string = "0x1100";
1382 my $LBase = CORE::hex $LBase_string;
1383 my $VBase_string = "0x1161";
1384 my $VBase = CORE::hex $VBase_string;
1385 my $TBase_string = "0x11A7";
1386 my $TBase = CORE::hex $TBase_string;
1387 my $SCount = 11172;
1388 my $LCount = 19;
1389 my $VCount = 21;
1390 my $TCount = 28;
1391 my $NCount = $VCount * $TCount;
1392
1393 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1394 # with the above published constants.
1395 my %Jamo;
1396 my %Jamo_L;     # Leading consonants
1397 my %Jamo_V;     # Vowels
1398 my %Jamo_T;     # Trailing consonants
1399
1400 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1401 # The key is the base name of the code point, and the value is an
1402 # array giving all the ranges that use this base name.  Each range
1403 # is actually a hash giving the 'low' and 'high' values of it.
1404 my %names_ending_in_code_point;
1405 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1406                                         # removed from the names
1407 # Inverse mapping.  The list of ranges that have these kinds of
1408 # names.  Each element contains the low, high, and base names in an
1409 # anonymous hash.
1410 my @code_points_ending_in_code_point;
1411
1412 # To hold Unicode's normalization test suite
1413 my @normalization_tests;
1414
1415 # Boolean: does this Unicode version have the hangul syllables, and are we
1416 # writing out a table for them?
1417 my $has_hangul_syllables = 0;
1418
1419 # Does this Unicode version have code points whose names end in their
1420 # respective code points, and are we writing out a table for them?  0 for no;
1421 # otherwise points to first property that a table is needed for them, so that
1422 # if multiple tables are needed, we don't create duplicates
1423 my $needing_code_points_ending_in_code_point = 0;
1424
1425 my @backslash_X_tests;     # List of tests read in for testing \X
1426 my @unhandled_properties;  # Will contain a list of properties found in
1427                            # the input that we didn't process.
1428 my @match_properties;      # Properties that have match tables, to be
1429                            # listed in the pod
1430 my @map_properties;        # Properties that get map files written
1431 my @named_sequences;       # NamedSequences.txt contents.
1432 my %potential_files;       # Generated list of all .txt files in the directory
1433                            # structure so we can warn if something is being
1434                            # ignored.
1435 my @files_actually_output; # List of files we generated.
1436 my @more_Names;            # Some code point names are compound; this is used
1437                            # to store the extra components of them.
1438 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1439                            # the minimum before we consider it equivalent to a
1440                            # candidate rational
1441 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1442
1443 # These store references to certain commonly used property objects
1444 my $gc;
1445 my $perl;
1446 my $block;
1447 my $perl_charname;
1448 my $print;
1449 my $Any;
1450 my $script;
1451
1452 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1453 my $has_In_conflicts = 0;
1454 my $has_Is_conflicts = 0;
1455
1456 sub internal_file_to_platform ($) {
1457     # Convert our file paths which have '/' separators to those of the
1458     # platform.
1459
1460     my $file = shift;
1461     return undef unless defined $file;
1462
1463     return File::Spec->join(split '/', $file);
1464 }
1465
1466 sub file_exists ($) {   # platform independent '-e'.  This program internally
1467                         # uses slash as a path separator.
1468     my $file = shift;
1469     return 0 if ! defined $file;
1470     return -e internal_file_to_platform($file);
1471 }
1472
1473 sub objaddr($) {
1474     # Returns the address of the blessed input object.
1475     # It doesn't check for blessedness because that would do a string eval
1476     # every call, and the program is structured so that this is never called
1477     # for a non-blessed object.
1478
1479     no overloading; # If overloaded, numifying below won't work.
1480
1481     # Numifying a ref gives its address.
1482     return pack 'J', $_[0];
1483 }
1484
1485 # These are used only if $annotate is true.
1486 # The entire range of Unicode characters is examined to populate these
1487 # after all the input has been processed.  But most can be skipped, as they
1488 # have the same descriptive phrases, such as being unassigned
1489 my @viacode;            # Contains the 1 million character names
1490 my @printable;          # boolean: And are those characters printable?
1491 my @annotate_char_type; # Contains a type of those characters, specifically
1492                         # for the purposes of annotation.
1493 my $annotate_ranges;    # A map of ranges of code points that have the same
1494                         # name for the purposes of annotation.  They map to the
1495                         # upper edge of the range, so that the end point can
1496                         # be immediately found.  This is used to skip ahead to
1497                         # the end of a range, and avoid processing each
1498                         # individual code point in it.
1499 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1500                                    # characters, but excluding those which are
1501                                    # also noncharacter code points
1502
1503 # The annotation types are an extension of the regular range types, though
1504 # some of the latter are folded into one.  Make the new types negative to
1505 # avoid conflicting with the regular types
1506 my $SURROGATE_TYPE = -1;
1507 my $UNASSIGNED_TYPE = -2;
1508 my $PRIVATE_USE_TYPE = -3;
1509 my $NONCHARACTER_TYPE = -4;
1510 my $CONTROL_TYPE = -5;
1511 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1512
1513 sub populate_char_info ($) {
1514     # Used only with the $annotate option.  Populates the arrays with the
1515     # input code point's info that are needed for outputting more detailed
1516     # comments.  If calling context wants a return, it is the end point of
1517     # any contiguous range of characters that share essentially the same info
1518
1519     my $i = shift;
1520     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1521
1522     $viacode[$i] = $perl_charname->value_of($i) || "";
1523
1524     # A character is generally printable if Unicode says it is,
1525     # but below we make sure that most Unicode general category 'C' types
1526     # aren't.
1527     $printable[$i] = $print->contains($i);
1528
1529     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1530
1531     # Only these two regular types are treated specially for annotations
1532     # purposes
1533     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1534                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1535
1536     # Give a generic name to all code points that don't have a real name.
1537     # We output ranges, if applicable, for these.  Also calculate the end
1538     # point of the range.
1539     my $end;
1540     if (! $viacode[$i]) {
1541         my $nonchar;
1542         if ($gc-> table('Private_use')->contains($i)) {
1543             $viacode[$i] = 'Private Use';
1544             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1545             $printable[$i] = 0;
1546             $end = $gc->table('Private_Use')->containing_range($i)->end;
1547         }
1548         elsif ((defined ($nonchar =
1549                             Property::property_ref('Noncharacter_Code_Point'))
1550                && $nonchar->table('Y')->contains($i)))
1551         {
1552             $viacode[$i] = 'Noncharacter';
1553             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1554             $printable[$i] = 0;
1555             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1556                                                     containing_range($i)->end;
1557         }
1558         elsif ($gc-> table('Control')->contains($i)) {
1559             $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1560             $annotate_char_type[$i] = $CONTROL_TYPE;
1561             $printable[$i] = 0;
1562         }
1563         elsif ($gc-> table('Unassigned')->contains($i)) {
1564             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1565             $printable[$i] = 0;
1566             if ($v_version lt v2.0.0) { # No blocks in earliest releases
1567                 $viacode[$i] = 'Unassigned';
1568                 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1569             }
1570             else {
1571                 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1572
1573                 # Because we name the unassigned by the blocks they are in, it
1574                 # can't go past the end of that block, and it also can't go
1575                 # past the unassigned range it is in.  The special table makes
1576                 # sure that the non-characters, which are unassigned, are
1577                 # separated out.
1578                 $end = min($block->containing_range($i)->end,
1579                            $unassigned_sans_noncharacters->
1580                                                     containing_range($i)->end);
1581             }
1582         }
1583         elsif ($v_version lt v2.0.0) {  # No surrogates in earliest releases
1584             $viacode[$i] = $gc->value_of($i);
1585             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1586             $printable[$i] = 0;
1587         }
1588         elsif ($gc-> table('Surrogate')->contains($i)) {
1589             $viacode[$i] = 'Surrogate';
1590             $annotate_char_type[$i] = $SURROGATE_TYPE;
1591             $printable[$i] = 0;
1592             $end = $gc->table('Surrogate')->containing_range($i)->end;
1593         }
1594         else {
1595             Carp::my_carp_bug("Can't figure out how to annotate "
1596                               . sprintf("U+%04X", $i)
1597                               . ".  Proceeding anyway.");
1598             $viacode[$i] = 'UNKNOWN';
1599             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1600             $printable[$i] = 0;
1601         }
1602     }
1603
1604     # Here, has a name, but if it's one in which the code point number is
1605     # appended to the name, do that.
1606     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1607         $viacode[$i] .= sprintf("-%04X", $i);
1608         $end = $perl_charname->containing_range($i)->end;
1609     }
1610
1611     # And here, has a name, but if it's a hangul syllable one, replace it with
1612     # the correct name from the Unicode algorithm
1613     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1614         use integer;
1615         my $SIndex = $i - $SBase;
1616         my $L = $LBase + $SIndex / $NCount;
1617         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1618         my $T = $TBase + $SIndex % $TCount;
1619         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1620         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1621         $end = $perl_charname->containing_range($i)->end;
1622     }
1623
1624     return if ! defined wantarray;
1625     return $i if ! defined $end;    # If not a range, return the input
1626
1627     # Save this whole range so can find the end point quickly
1628     $annotate_ranges->add_map($i, $end, $end);
1629
1630     return $end;
1631 }
1632
1633 # Commented code below should work on Perl 5.8.
1634 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1635 ## the native perl version of it (which is what would operate under miniperl)
1636 ## is extremely slow, as it does a string eval every call.
1637 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1638 #                            && defined eval "require Scalar::Util";
1639 #
1640 #sub objaddr($) {
1641 #    # Returns the address of the blessed input object.  Uses the XS version if
1642 #    # available.  It doesn't check for blessedness because that would do a
1643 #    # string eval every call, and the program is structured so that this is
1644 #    # never called for a non-blessed object.
1645 #
1646 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1647 #
1648 #    # Check at least that is a ref.
1649 #    my $pkg = ref($_[0]) or return undef;
1650 #
1651 #    # Change to a fake package to defeat any overloaded stringify
1652 #    bless $_[0], 'main::Fake';
1653 #
1654 #    # Numifying a ref gives its address.
1655 #    my $addr = pack 'J', $_[0];
1656 #
1657 #    # Return to original class
1658 #    bless $_[0], $pkg;
1659 #    return $addr;
1660 #}
1661
1662 sub max ($$) {
1663     my $a = shift;
1664     my $b = shift;
1665     return $a if $a >= $b;
1666     return $b;
1667 }
1668
1669 sub min ($$) {
1670     my $a = shift;
1671     my $b = shift;
1672     return $a if $a <= $b;
1673     return $b;
1674 }
1675
1676 sub clarify_number ($) {
1677     # This returns the input number with underscores inserted every 3 digits
1678     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1679     # checked.
1680
1681     my $number = shift;
1682     my $pos = length($number) - 3;
1683     return $number if $pos <= 1;
1684     while ($pos > 0) {
1685         substr($number, $pos, 0) = '_';
1686         $pos -= 3;
1687     }
1688     return $number;
1689 }
1690
1691
1692 package Carp;
1693
1694 # These routines give a uniform treatment of messages in this program.  They
1695 # are placed in the Carp package to cause the stack trace to not include them,
1696 # although an alternative would be to use another package and set @CARP_NOT
1697 # for it.
1698
1699 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1700
1701 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1702 # and overload trying to load Scalar:Util under miniperl.  See
1703 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1704 undef $overload::VERSION;
1705
1706 sub my_carp {
1707     my $message = shift || "";
1708     my $nofold = shift || 0;
1709
1710     if ($message) {
1711         $message = main::join_lines($message);
1712         $message =~ s/^$0: *//;     # Remove initial program name
1713         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1714         $message = "\n$0: $message;";
1715
1716         # Fold the message with program name, semi-colon end punctuation
1717         # (which looks good with the message that carp appends to it), and a
1718         # hanging indent for continuation lines.
1719         $message = main::simple_fold($message, "", 4) unless $nofold;
1720         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1721                                     # appends is to the same line
1722     }
1723
1724     return $message if defined wantarray;   # If a caller just wants the msg
1725
1726     carp $message;
1727     return;
1728 }
1729
1730 sub my_carp_bug {
1731     # This is called when it is clear that the problem is caused by a bug in
1732     # this program.
1733
1734     my $message = shift;
1735     $message =~ s/^$0: *//;
1736     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1737     carp $message;
1738     return;
1739 }
1740
1741 sub carp_too_few_args {
1742     if (@_ != 2) {
1743         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1744         return;
1745     }
1746
1747     my $args_ref = shift;
1748     my $count = shift;
1749
1750     my_carp_bug("Need at least $count arguments to "
1751         . (caller 1)[3]
1752         . ".  Instead got: '"
1753         . join ', ', @$args_ref
1754         . "'.  No action taken.");
1755     return;
1756 }
1757
1758 sub carp_extra_args {
1759     my $args_ref = shift;
1760     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1761
1762     unless (ref $args_ref) {
1763         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1764         return;
1765     }
1766     my ($package, $file, $line) = caller;
1767     my $subroutine = (caller 1)[3];
1768
1769     my $list;
1770     if (ref $args_ref eq 'HASH') {
1771         foreach my $key (keys %$args_ref) {
1772             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1773         }
1774         $list = join ', ', each %{$args_ref};
1775     }
1776     elsif (ref $args_ref eq 'ARRAY') {
1777         foreach my $arg (@$args_ref) {
1778             $arg = $UNDEF unless defined $arg;
1779         }
1780         $list = join ', ', @$args_ref;
1781     }
1782     else {
1783         my_carp_bug("Can't cope with ref "
1784                 . ref($args_ref)
1785                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1786         return;
1787     }
1788
1789     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1790     return;
1791 }
1792
1793 package main;
1794
1795 { # Closure
1796
1797     # This program uses the inside-out method for objects, as recommended in
1798     # "Perl Best Practices".  This closure aids in generating those.  There
1799     # are two routines.  setup_package() is called once per package to set
1800     # things up, and then set_access() is called for each hash representing a
1801     # field in the object.  These routines arrange for the object to be
1802     # properly destroyed when no longer used, and for standard accessor
1803     # functions to be generated.  If you need more complex accessors, just
1804     # write your own and leave those accesses out of the call to set_access().
1805     # More details below.
1806
1807     my %constructor_fields; # fields that are to be used in constructors; see
1808                             # below
1809
1810     # The values of this hash will be the package names as keys to other
1811     # hashes containing the name of each field in the package as keys, and
1812     # references to their respective hashes as values.
1813     my %package_fields;
1814
1815     sub setup_package {
1816         # Sets up the package, creating standard DESTROY and dump methods
1817         # (unless already defined).  The dump method is used in debugging by
1818         # simple_dumper().
1819         # The optional parameters are:
1820         #   a)  a reference to a hash, that gets populated by later
1821         #       set_access() calls with one of the accesses being
1822         #       'constructor'.  The caller can then refer to this, but it is
1823         #       not otherwise used by these two routines.
1824         #   b)  a reference to a callback routine to call during destruction
1825         #       of the object, before any fields are actually destroyed
1826
1827         my %args = @_;
1828         my $constructor_ref = delete $args{'Constructor_Fields'};
1829         my $destroy_callback = delete $args{'Destroy_Callback'};
1830         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1831
1832         my %fields;
1833         my $package = (caller)[0];
1834
1835         $package_fields{$package} = \%fields;
1836         $constructor_fields{$package} = $constructor_ref;
1837
1838         unless ($package->can('DESTROY')) {
1839             my $destroy_name = "${package}::DESTROY";
1840             no strict "refs";
1841
1842             # Use typeglob to give the anonymous subroutine the name we want
1843             *$destroy_name = sub {
1844                 my $self = shift;
1845                 my $addr = do { no overloading; pack 'J', $self; };
1846
1847                 $self->$destroy_callback if $destroy_callback;
1848                 foreach my $field (keys %{$package_fields{$package}}) {
1849                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1850                     delete $package_fields{$package}{$field}{$addr};
1851                 }
1852                 return;
1853             }
1854         }
1855
1856         unless ($package->can('dump')) {
1857             my $dump_name = "${package}::dump";
1858             no strict "refs";
1859             *$dump_name = sub {
1860                 my $self = shift;
1861                 return dump_inside_out($self, $package_fields{$package}, @_);
1862             }
1863         }
1864         return;
1865     }
1866
1867     sub set_access {
1868         # Arrange for the input field to be garbage collected when no longer
1869         # needed.  Also, creates standard accessor functions for the field
1870         # based on the optional parameters-- none if none of these parameters:
1871         #   'addable'    creates an 'add_NAME()' accessor function.
1872         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1873         #                function.
1874         #   'settable'   creates a 'set_NAME()' accessor function.
1875         #   'constructor' doesn't create an accessor function, but adds the
1876         #                field to the hash that was previously passed to
1877         #                setup_package();
1878         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1879         # 'add' etc. all mean 'addable'.
1880         # The read accessor function will work on both array and scalar
1881         # values.  If another accessor in the parameter list is 'a', the read
1882         # access assumes an array.  You can also force it to be array access
1883         # by specifying 'readable_array' instead of 'readable'
1884         #
1885         # A sort-of 'protected' access can be set-up by preceding the addable,
1886         # readable or settable with some initial portion of 'protected_' (but,
1887         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1888         # "protection" is only by convention.  All that happens is that the
1889         # accessor functions' names begin with an underscore.  So instead of
1890         # calling set_foo, the call is _set_foo.  (Real protection could be
1891         # accomplished by having a new subroutine, end_package, called at the
1892         # end of each package, and then storing the __LINE__ ranges and
1893         # checking them on every accessor.  But that is way overkill.)
1894
1895         # We create anonymous subroutines as the accessors and then use
1896         # typeglobs to assign them to the proper package and name
1897
1898         my $name = shift;   # Name of the field
1899         my $field = shift;  # Reference to the inside-out hash containing the
1900                             # field
1901
1902         my $package = (caller)[0];
1903
1904         if (! exists $package_fields{$package}) {
1905             croak "$0: Must call 'setup_package' before 'set_access'";
1906         }
1907
1908         # Stash the field so DESTROY can get it.
1909         $package_fields{$package}{$name} = $field;
1910
1911         # Remaining arguments are the accessors.  For each...
1912         foreach my $access (@_) {
1913             my $access = lc $access;
1914
1915             my $protected = "";
1916
1917             # Match the input as far as it goes.
1918             if ($access =~ /^(p[^_]*)_/) {
1919                 $protected = $1;
1920                 if (substr('protected_', 0, length $protected)
1921                     eq $protected)
1922                 {
1923
1924                     # Add 1 for the underscore not included in $protected
1925                     $access = substr($access, length($protected) + 1);
1926                     $protected = '_';
1927                 }
1928                 else {
1929                     $protected = "";
1930                 }
1931             }
1932
1933             if (substr('addable', 0, length $access) eq $access) {
1934                 my $subname = "${package}::${protected}add_$name";
1935                 no strict "refs";
1936
1937                 # add_ accessor.  Don't add if already there, which we
1938                 # determine using 'eq' for scalars and '==' otherwise.
1939                 *$subname = sub {
1940                     use strict "refs";
1941                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1942                     my $self = shift;
1943                     my $value = shift;
1944                     my $addr = do { no overloading; pack 'J', $self; };
1945                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1946                     if (ref $value) {
1947                         return if grep { $value == $_ } @{$field->{$addr}};
1948                     }
1949                     else {
1950                         return if grep { $value eq $_ } @{$field->{$addr}};
1951                     }
1952                     push @{$field->{$addr}}, $value;
1953                     return;
1954                 }
1955             }
1956             elsif (substr('constructor', 0, length $access) eq $access) {
1957                 if ($protected) {
1958                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1959                 }
1960                 else {
1961                     $constructor_fields{$package}{$name} = $field;
1962                 }
1963             }
1964             elsif (substr('readable_array', 0, length $access) eq $access) {
1965
1966                 # Here has read access.  If one of the other parameters for
1967                 # access is array, or this one specifies array (by being more
1968                 # than just 'readable_'), then create a subroutine that
1969                 # assumes the data is an array.  Otherwise just a scalar
1970                 my $subname = "${package}::${protected}$name";
1971                 if (grep { /^a/i } @_
1972                     or length($access) > length('readable_'))
1973                 {
1974                     no strict "refs";
1975                     *$subname = sub {
1976                         use strict "refs";
1977                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1978                         my $addr = do { no overloading; pack 'J', $_[0]; };
1979                         if (ref $field->{$addr} ne 'ARRAY') {
1980                             my $type = ref $field->{$addr};
1981                             $type = 'scalar' unless $type;
1982                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1983                             return;
1984                         }
1985                         return scalar @{$field->{$addr}} unless wantarray;
1986
1987                         # Make a copy; had problems with caller modifying the
1988                         # original otherwise
1989                         my @return = @{$field->{$addr}};
1990                         return @return;
1991                     }
1992                 }
1993                 else {
1994
1995                     # Here not an array value, a simpler function.
1996                     no strict "refs";
1997                     *$subname = sub {
1998                         use strict "refs";
1999                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2000                         no overloading;
2001                         return $field->{pack 'J', $_[0]};
2002                     }
2003                 }
2004             }
2005             elsif (substr('settable', 0, length $access) eq $access) {
2006                 my $subname = "${package}::${protected}set_$name";
2007                 no strict "refs";
2008                 *$subname = sub {
2009                     use strict "refs";
2010                     if (main::DEBUG) {
2011                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2012                         Carp::carp_extra_args(\@_) if @_ > 2;
2013                     }
2014                     # $self is $_[0]; $value is $_[1]
2015                     no overloading;
2016                     $field->{pack 'J', $_[0]} = $_[1];
2017                     return;
2018                 }
2019             }
2020             else {
2021                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2022             }
2023         }
2024         return;
2025     }
2026 }
2027
2028 package Input_file;
2029
2030 # All input files use this object, which stores various attributes about them,
2031 # and provides for convenient, uniform handling.  The run method wraps the
2032 # processing.  It handles all the bookkeeping of opening, reading, and closing
2033 # the file, returning only significant input lines.
2034 #
2035 # Each object gets a handler which processes the body of the file, and is
2036 # called by run().  Most should use the generic, default handler, which has
2037 # code scrubbed to handle things you might not expect.  A handler should
2038 # basically be a while(next_line()) {...} loop.
2039 #
2040 # You can also set up handlers to
2041 #   1) call before the first line is read for pre processing
2042 #   2) call to adjust each line of the input before the main handler gets them
2043 #   3) call upon EOF before the main handler exits its loop
2044 #   4) call at the end for post processing
2045 #
2046 # $_ is used to store the input line, and is to be filtered by the
2047 # each_line_handler()s.  So, if the format of the line is not in the desired
2048 # format for the main handler, these are used to do that adjusting.  They can
2049 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2050 # so the $_ output of one is used as the input to the next.  None of the other
2051 # handlers are stackable, but could easily be changed to be so.
2052 #
2053 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2054 # which insert the parameters as lines to be processed before the next input
2055 # file line is read.  This allows the EOF handler to flush buffers, for
2056 # example.  The difference between the two routines is that the lines inserted
2057 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
2058 # called it from such a handler, you would get infinite recursion.)  Lines
2059 # inserted by insert_adjusted_lines() go directly to the main handler without
2060 # any adjustments.  If the  post-processing handler calls any of these, there
2061 # will be no effect.  Some error checking for these conditions could be added,
2062 # but it hasn't been done.
2063 #
2064 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2065 # to prevent further processing of the line.  This routine will output the
2066 # message as a warning once, and then keep a count of the lines that have the
2067 # same message, and output that count at the end of the file's processing.
2068 # This keeps the number of messages down to a manageable amount.
2069 #
2070 # get_missings() should be called to retrieve any @missing input lines.
2071 # Messages will be raised if this isn't done if the options aren't to ignore
2072 # missings.
2073
2074 sub trace { return main::trace(@_); }
2075
2076 { # Closure
2077     # Keep track of fields that are to be put into the constructor.
2078     my %constructor_fields;
2079
2080     main::setup_package(Constructor_Fields => \%constructor_fields);
2081
2082     my %file; # Input file name, required
2083     main::set_access('file', \%file, qw{ c r });
2084
2085     my %first_released; # Unicode version file was first released in, required
2086     main::set_access('first_released', \%first_released, qw{ c r });
2087
2088     my %handler;    # Subroutine to process the input file, defaults to
2089                     # 'process_generic_property_file'
2090     main::set_access('handler', \%handler, qw{ c });
2091
2092     my %property;
2093     # name of property this file is for.  defaults to none, meaning not
2094     # applicable, or is otherwise determinable, for example, from each line.
2095     main::set_access('property', \%property, qw{ c r });
2096
2097     my %optional;
2098     # If this is true, the file is optional.  If not present, no warning is
2099     # output.  If it is present, the string given by this parameter is
2100     # evaluated, and if false the file is not processed.
2101     main::set_access('optional', \%optional, 'c', 'r');
2102
2103     my %non_skip;
2104     # This is used for debugging, to skip processing of all but a few input
2105     # files.  Add 'non_skip => 1' to the constructor for those files you want
2106     # processed when you set the $debug_skip global.
2107     main::set_access('non_skip', \%non_skip, 'c');
2108
2109     my %skip;
2110     # This is used to skip processing of this input file semi-permanently,
2111     # when it evaluates to true.  The value should be the reason the file is
2112     # being skipped.  It is used for files that we aren't planning to process
2113     # anytime soon, but want to allow to be in the directory and not raise a
2114     # message that we are not handling.  Mostly for test files.  This is in
2115     # contrast to the non_skip element, which is supposed to be used very
2116     # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
2117     # pretty much will never look at can be placed in the global
2118     # %ignored_files instead.  Ones used here will be added to %skipped files
2119     main::set_access('skip', \%skip, 'c');
2120
2121     my %each_line_handler;
2122     # list of subroutines to look at and filter each non-comment line in the
2123     # file.  defaults to none.  The subroutines are called in order, each is
2124     # to adjust $_ for the next one, and the final one adjusts it for
2125     # 'handler'
2126     main::set_access('each_line_handler', \%each_line_handler, 'c');
2127
2128     my %has_missings_defaults;
2129     # ? Are there lines in the file giving default values for code points
2130     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2131     # the norm, but IGNORED means it has such lines, but the handler doesn't
2132     # use them.  Having these three states allows us to catch changes to the
2133     # UCD that this program should track
2134     main::set_access('has_missings_defaults',
2135                                         \%has_missings_defaults, qw{ c r });
2136
2137     my %pre_handler;
2138     # Subroutine to call before doing anything else in the file.  If undef, no
2139     # such handler is called.
2140     main::set_access('pre_handler', \%pre_handler, qw{ c });
2141
2142     my %eof_handler;
2143     # Subroutine to call upon getting an EOF on the input file, but before
2144     # that is returned to the main handler.  This is to allow buffers to be
2145     # flushed.  The handler is expected to call insert_lines() or
2146     # insert_adjusted() with the buffered material
2147     main::set_access('eof_handler', \%eof_handler, qw{ c r });
2148
2149     my %post_handler;
2150     # Subroutine to call after all the lines of the file are read in and
2151     # processed.  If undef, no such handler is called.
2152     main::set_access('post_handler', \%post_handler, qw{ c });
2153
2154     my %progress_message;
2155     # Message to print to display progress in lieu of the standard one
2156     main::set_access('progress_message', \%progress_message, qw{ c });
2157
2158     my %handle;
2159     # cache open file handle, internal.  Is undef if file hasn't been
2160     # processed at all, empty if has;
2161     main::set_access('handle', \%handle);
2162
2163     my %added_lines;
2164     # cache of lines added virtually to the file, internal
2165     main::set_access('added_lines', \%added_lines);
2166
2167     my %errors;
2168     # cache of errors found, internal
2169     main::set_access('errors', \%errors);
2170
2171     my %missings;
2172     # storage of '@missing' defaults lines
2173     main::set_access('missings', \%missings);
2174
2175     sub new {
2176         my $class = shift;
2177
2178         my $self = bless \do{ my $anonymous_scalar }, $class;
2179         my $addr = do { no overloading; pack 'J', $self; };
2180
2181         # Set defaults
2182         $handler{$addr} = \&main::process_generic_property_file;
2183         $non_skip{$addr} = 0;
2184         $skip{$addr} = 0;
2185         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2186         $handle{$addr} = undef;
2187         $added_lines{$addr} = [ ];
2188         $each_line_handler{$addr} = [ ];
2189         $errors{$addr} = { };
2190         $missings{$addr} = [ ];
2191
2192         # Two positional parameters.
2193         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2194         $file{$addr} = main::internal_file_to_platform(shift);
2195         $first_released{$addr} = shift;
2196
2197         # The rest of the arguments are key => value pairs
2198         # %constructor_fields has been set up earlier to list all possible
2199         # ones.  Either set or push, depending on how the default has been set
2200         # up just above.
2201         my %args = @_;
2202         foreach my $key (keys %args) {
2203             my $argument = $args{$key};
2204
2205             # Note that the fields are the lower case of the constructor keys
2206             my $hash = $constructor_fields{lc $key};
2207             if (! defined $hash) {
2208                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2209                 next;
2210             }
2211             if (ref $hash->{$addr} eq 'ARRAY') {
2212                 if (ref $argument eq 'ARRAY') {
2213                     foreach my $argument (@{$argument}) {
2214                         next if ! defined $argument;
2215                         push @{$hash->{$addr}}, $argument;
2216                     }
2217                 }
2218                 else {
2219                     push @{$hash->{$addr}}, $argument if defined $argument;
2220                 }
2221             }
2222             else {
2223                 $hash->{$addr} = $argument;
2224             }
2225             delete $args{$key};
2226         };
2227
2228         # If the file has a property for it, it means that the property is not
2229         # listed in the file's entries.  So add a handler to the list of line
2230         # handlers to insert the property name into the lines, to provide a
2231         # uniform interface to the final processing subroutine.
2232         # the final code doesn't have to worry about that.
2233         if ($property{$addr}) {
2234             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2235         }
2236
2237         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2238             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2239         }
2240
2241         # If skipping, set to optional, and add to list of ignored files,
2242         # including its reason
2243         if ($skip{$addr}) {
2244             $optional{$addr} = 1;
2245             $skipped_files{$file{$addr}} = $skip{$addr}
2246         }
2247
2248         return $self;
2249     }
2250
2251
2252     use overload
2253         fallback => 0,
2254         qw("") => "_operator_stringify",
2255         "." => \&main::_operator_dot,
2256         ".=" => \&main::_operator_dot_equal,
2257     ;
2258
2259     sub _operator_stringify {
2260         my $self = shift;
2261
2262         return __PACKAGE__ . " object for " . $self->file;
2263     }
2264
2265     # flag to make sure extracted files are processed early
2266     my $seen_non_extracted_non_age = 0;
2267
2268     sub run {
2269         # Process the input object $self.  This opens and closes the file and
2270         # calls all the handlers for it.  Currently,  this can only be called
2271         # once per file, as it destroy's the EOF handler
2272
2273         my $self = shift;
2274         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2275
2276         my $addr = do { no overloading; pack 'J', $self; };
2277
2278         my $file = $file{$addr};
2279
2280         # Don't process if not expecting this file (because released later
2281         # than this Unicode version), and isn't there.  This means if someone
2282         # copies it into an earlier version's directory, we will go ahead and
2283         # process it.
2284         return if $first_released{$addr} gt $v_version && ! -e $file;
2285
2286         # If in debugging mode and this file doesn't have the non-skip
2287         # flag set, and isn't one of the critical files, skip it.
2288         if ($debug_skip
2289             && $first_released{$addr} ne v0
2290             && ! $non_skip{$addr})
2291         {
2292             print "Skipping $file in debugging\n" if $verbosity;
2293             return;
2294         }
2295
2296         # File could be optional
2297         if ($optional{$addr}) {
2298             return unless -e $file;
2299             my $result = eval $optional{$addr};
2300             if (! defined $result) {
2301                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2302                 return;
2303             }
2304             if (! $result) {
2305                 if ($verbosity) {
2306                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2307                 }
2308                 return;
2309             }
2310         }
2311
2312         if (! defined $file || ! -e $file) {
2313
2314             # If the file doesn't exist, see if have internal data for it
2315             # (based on first_released being 0).
2316             if ($first_released{$addr} eq v0) {
2317                 $handle{$addr} = 'pretend_is_open';
2318             }
2319             else {
2320                 if (! $optional{$addr}  # File could be optional
2321                     && $v_version ge $first_released{$addr})
2322                 {
2323                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2324                 }
2325                 return;
2326             }
2327         }
2328         else {
2329
2330             # Here, the file exists.  Some platforms may change the case of
2331             # its name
2332             if ($seen_non_extracted_non_age) {
2333                 if ($file =~ /$EXTRACTED/i) {
2334                     Carp::my_carp_bug(main::join_lines(<<END
2335 $file should be processed just after the 'Prop...Alias' files, and before
2336 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2337 have subtle problems
2338 END
2339                     ));
2340                 }
2341             }
2342             elsif ($EXTRACTED_DIR
2343                     && $first_released{$addr} ne v0
2344                     && $file !~ /$EXTRACTED/i
2345                     && lc($file) ne 'dage.txt')
2346             {
2347                 # We don't set this (by the 'if' above) if we have no
2348                 # extracted directory, so if running on an early version,
2349                 # this test won't work.  Not worth worrying about.
2350                 $seen_non_extracted_non_age = 1;
2351             }
2352
2353             # And mark the file as having being processed, and warn if it
2354             # isn't a file we are expecting.  As we process the files,
2355             # they are deleted from the hash, so any that remain at the
2356             # end of the program are files that we didn't process.
2357             my $fkey = File::Spec->rel2abs($file);
2358             my $expecting = delete $potential_files{lc($fkey)};
2359
2360             Carp::my_carp("Was not expecting '$file'.") if
2361                     ! $expecting
2362                     && ! defined $handle{$addr};
2363
2364             # Having deleted from expected files, we can quit if not to do
2365             # anything.  Don't print progress unless really want verbosity
2366             if ($skip{$addr}) {
2367                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2368                 return;
2369             }
2370
2371             # Open the file, converting the slashes used in this program
2372             # into the proper form for the OS
2373             my $file_handle;
2374             if (not open $file_handle, "<", $file) {
2375                 Carp::my_carp("Can't open $file.  Skipping: $!");
2376                 return 0;
2377             }
2378             $handle{$addr} = $file_handle; # Cache the open file handle
2379         }
2380
2381         if ($verbosity >= $PROGRESS) {
2382             if ($progress_message{$addr}) {
2383                 print "$progress_message{$addr}\n";
2384             }
2385             else {
2386                 # If using a virtual file, say so.
2387                 print "Processing ", (-e $file)
2388                                        ? $file
2389                                        : "substitute $file",
2390                                      "\n";
2391             }
2392         }
2393
2394
2395         # Call any special handler for before the file.
2396         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2397
2398         # Then the main handler
2399         &{$handler{$addr}}($self);
2400
2401         # Then any special post-file handler.
2402         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2403
2404         # If any errors have been accumulated, output the counts (as the first
2405         # error message in each class was output when it was encountered).
2406         if ($errors{$addr}) {
2407             my $total = 0;
2408             my $types = 0;
2409             foreach my $error (keys %{$errors{$addr}}) {
2410                 $total += $errors{$addr}->{$error};
2411                 delete $errors{$addr}->{$error};
2412                 $types++;
2413             }
2414             if ($total > 1) {
2415                 my $message
2416                         = "A total of $total lines had errors in $file.  ";
2417
2418                 $message .= ($types == 1)
2419                             ? '(Only the first one was displayed.)'
2420                             : '(Only the first of each type was displayed.)';
2421                 Carp::my_carp($message);
2422             }
2423         }
2424
2425         if (@{$missings{$addr}}) {
2426             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2427         }
2428
2429         # If a real file handle, close it.
2430         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2431                                                         ref $handle{$addr};
2432         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2433                                # the file, as opposed to undef
2434         return;
2435     }
2436
2437     sub next_line {
2438         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2439         # if such a line exists.  'logical' means that any lines that have
2440         # been added via insert_lines() will be returned in $_ before the file
2441         # is read again.
2442
2443         my $self = shift;
2444         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2445
2446         my $addr = do { no overloading; pack 'J', $self; };
2447
2448         # Here the file is open (or if the handle is not a ref, is an open
2449         # 'virtual' file).  Get the next line; any inserted lines get priority
2450         # over the file itself.
2451         my $adjusted;
2452
2453         LINE:
2454         while (1) { # Loop until find non-comment, non-empty line
2455             #local $to_trace = 1 if main::DEBUG;
2456             my $inserted_ref = shift @{$added_lines{$addr}};
2457             if (defined $inserted_ref) {
2458                 ($adjusted, $_) = @{$inserted_ref};
2459                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2460                 return 1 if $adjusted;
2461             }
2462             else {
2463                 last if ! ref $handle{$addr}; # Don't read unless is real file
2464                 last if ! defined ($_ = readline $handle{$addr});
2465             }
2466             chomp;
2467             trace $_ if main::DEBUG && $to_trace;
2468
2469             # See if this line is the comment line that defines what property
2470             # value that code points that are not listed in the file should
2471             # have.  The format or existence of these lines is not guaranteed
2472             # by Unicode since they are comments, but the documentation says
2473             # that this was added for machine-readability, so probably won't
2474             # change.  This works starting in Unicode Version 5.0.  They look
2475             # like:
2476             #
2477             # @missing: 0000..10FFFF; Not_Reordered
2478             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2479             # @missing: 0000..10FFFF; ; NaN
2480             #
2481             # Save the line for a later get_missings() call.
2482             if (/$missing_defaults_prefix/) {
2483                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2484                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2485                 }
2486                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2487                     my @defaults = split /\s* ; \s*/x, $_;
2488
2489                     # The first field is the @missing, which ends in a
2490                     # semi-colon, so can safely shift.
2491                     shift @defaults;
2492
2493                     # Some of these lines may have empty field placeholders
2494                     # which get in the way.  An example is:
2495                     # @missing: 0000..10FFFF; ; NaN
2496                     # Remove them.  Process starting from the top so the
2497                     # splice doesn't affect things still to be looked at.
2498                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2499                         next if $defaults[$i] ne "";
2500                         splice @defaults, $i, 1;
2501                     }
2502
2503                     # What's left should be just the property (maybe) and the
2504                     # default.  Having only one element means it doesn't have
2505                     # the property.
2506                     my $default;
2507                     my $property;
2508                     if (@defaults >= 1) {
2509                         if (@defaults == 1) {
2510                             $default = $defaults[0];
2511                         }
2512                         else {
2513                             $property = $defaults[0];
2514                             $default = $defaults[1];
2515                         }
2516                     }
2517
2518                     if (@defaults < 1
2519                         || @defaults > 2
2520                         || ($default =~ /^</
2521                             && $default !~ /^<code *point>$/i
2522                             && $default !~ /^<none>$/i
2523                             && $default !~ /^<script>$/i))
2524                     {
2525                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2526                     }
2527                     else {
2528
2529                         # If the property is missing from the line, it should
2530                         # be the one for the whole file
2531                         $property = $property{$addr} if ! defined $property;
2532
2533                         # Change <none> to the null string, which is what it
2534                         # really means.  If the default is the code point
2535                         # itself, set it to <code point>, which is what
2536                         # Unicode uses (but sometimes they've forgotten the
2537                         # space)
2538                         if ($default =~ /^<none>$/i) {
2539                             $default = "";
2540                         }
2541                         elsif ($default =~ /^<code *point>$/i) {
2542                             $default = $CODE_POINT;
2543                         }
2544                         elsif ($default =~ /^<script>$/i) {
2545
2546                             # Special case this one.  Currently is from
2547                             # ScriptExtensions.txt, and means for all unlisted
2548                             # code points, use their Script property values.
2549                             # For the code points not listed in that file, the
2550                             # default value is 'Unknown'.
2551                             $default = "Unknown";
2552                         }
2553
2554                         # Store them as a sub-arrays with both components.
2555                         push @{$missings{$addr}}, [ $default, $property ];
2556                     }
2557                 }
2558
2559                 # There is nothing for the caller to process on this comment
2560                 # line.
2561                 next;
2562             }
2563
2564             # Remove comments and trailing space, and skip this line if the
2565             # result is empty
2566             s/#.*//;
2567             s/\s+$//;
2568             next if /^$/;
2569
2570             # Call any handlers for this line, and skip further processing of
2571             # the line if the handler sets the line to null.
2572             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2573                 &{$sub_ref}($self);
2574                 next LINE if /^$/;
2575             }
2576
2577             # Here the line is ok.  return success.
2578             return 1;
2579         } # End of looping through lines.
2580
2581         # If there is an EOF handler, call it (only once) and if it generates
2582         # more lines to process go back in the loop to handle them.
2583         if ($eof_handler{$addr}) {
2584             &{$eof_handler{$addr}}($self);
2585             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2586             goto LINE if $added_lines{$addr};
2587         }
2588
2589         # Return failure -- no more lines.
2590         return 0;
2591
2592     }
2593
2594 #   Not currently used, not fully tested.
2595 #    sub peek {
2596 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2597 #        # record.  Not callable from an each_line_handler(), nor does it call
2598 #        # an each_line_handler() on the line.
2599 #
2600 #        my $self = shift;
2601 #        my $addr = do { no overloading; pack 'J', $self; };
2602 #
2603 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2604 #            my ($adjusted, $line) = @{$inserted_ref};
2605 #            next if $adjusted;
2606 #
2607 #            # Remove comments and trailing space, and return a non-empty
2608 #            # resulting line
2609 #            $line =~ s/#.*//;
2610 #            $line =~ s/\s+$//;
2611 #            return $line if $line ne "";
2612 #        }
2613 #
2614 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2615 #        while (1) { # Loop until find non-comment, non-empty line
2616 #            local $to_trace = 1 if main::DEBUG;
2617 #            trace $_ if main::DEBUG && $to_trace;
2618 #            return if ! defined (my $line = readline $handle{$addr});
2619 #            chomp $line;
2620 #            push @{$added_lines{$addr}}, [ 0, $line ];
2621 #
2622 #            $line =~ s/#.*//;
2623 #            $line =~ s/\s+$//;
2624 #            return $line if $line ne "";
2625 #        }
2626 #
2627 #        return;
2628 #    }
2629
2630
2631     sub insert_lines {
2632         # Lines can be inserted so that it looks like they were in the input
2633         # file at the place it was when this routine is called.  See also
2634         # insert_adjusted_lines().  Lines inserted via this routine go through
2635         # any each_line_handler()
2636
2637         my $self = shift;
2638
2639         # Each inserted line is an array, with the first element being 0 to
2640         # indicate that this line hasn't been adjusted, and needs to be
2641         # processed.
2642         no overloading;
2643         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2644         return;
2645     }
2646
2647     sub insert_adjusted_lines {
2648         # Lines can be inserted so that it looks like they were in the input
2649         # file at the place it was when this routine is called.  See also
2650         # insert_lines().  Lines inserted via this routine are already fully
2651         # adjusted, ready to be processed; each_line_handler()s handlers will
2652         # not be called.  This means this is not a completely general
2653         # facility, as only the last each_line_handler on the stack should
2654         # call this.  It could be made more general, by passing to each of the
2655         # line_handlers their position on the stack, which they would pass on
2656         # to this routine, and that would replace the boolean first element in
2657         # the anonymous array pushed here, so that the next_line routine could
2658         # use that to call only those handlers whose index is after it on the
2659         # stack.  But this is overkill for what is needed now.
2660
2661         my $self = shift;
2662         trace $_[0] if main::DEBUG && $to_trace;
2663
2664         # Each inserted line is an array, with the first element being 1 to
2665         # indicate that this line has been adjusted
2666         no overloading;
2667         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2668         return;
2669     }
2670
2671     sub get_missings {
2672         # Returns the stored up @missings lines' values, and clears the list.
2673         # The values are in an array, consisting of the default in the first
2674         # element, and the property in the 2nd.  However, since these lines
2675         # can be stacked up, the return is an array of all these arrays.
2676
2677         my $self = shift;
2678         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2679
2680         my $addr = do { no overloading; pack 'J', $self; };
2681
2682         # If not accepting a list return, just return the first one.
2683         return shift @{$missings{$addr}} unless wantarray;
2684
2685         my @return = @{$missings{$addr}};
2686         undef @{$missings{$addr}};
2687         return @return;
2688     }
2689
2690     sub _insert_property_into_line {
2691         # Add a property field to $_, if this file requires it.
2692
2693         my $self = shift;
2694         my $addr = do { no overloading; pack 'J', $self; };
2695         my $property = $property{$addr};
2696         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2697
2698         $_ =~ s/(;|$)/; $property$1/;
2699         return;
2700     }
2701
2702     sub carp_bad_line {
2703         # Output consistent error messages, using either a generic one, or the
2704         # one given by the optional parameter.  To avoid gazillions of the
2705         # same message in case the syntax of a  file is way off, this routine
2706         # only outputs the first instance of each message, incrementing a
2707         # count so the totals can be output at the end of the file.
2708
2709         my $self = shift;
2710         my $message = shift;
2711         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2712
2713         my $addr = do { no overloading; pack 'J', $self; };
2714
2715         $message = 'Unexpected line' unless $message;
2716
2717         # No trailing punctuation so as to fit with our addenda.
2718         $message =~ s/[.:;,]$//;
2719
2720         # If haven't seen this exact message before, output it now.  Otherwise
2721         # increment the count of how many times it has occurred
2722         unless ($errors{$addr}->{$message}) {
2723             Carp::my_carp("$message in '$_' in "
2724                             . $file{$addr}
2725                             . " at line $..  Skipping this line;");
2726             $errors{$addr}->{$message} = 1;
2727         }
2728         else {
2729             $errors{$addr}->{$message}++;
2730         }
2731
2732         # Clear the line to prevent any further (meaningful) processing of it.
2733         $_ = "";
2734
2735         return;
2736     }
2737 } # End closure
2738
2739 package Multi_Default;
2740
2741 # Certain properties in early versions of Unicode had more than one possible
2742 # default for code points missing from the files.  In these cases, one
2743 # default applies to everything left over after all the others are applied,
2744 # and for each of the others, there is a description of which class of code
2745 # points applies to it.  This object helps implement this by storing the
2746 # defaults, and for all but that final default, an eval string that generates
2747 # the class that it applies to.
2748
2749
2750 {   # Closure
2751
2752     main::setup_package();
2753
2754     my %class_defaults;
2755     # The defaults structure for the classes
2756     main::set_access('class_defaults', \%class_defaults);
2757
2758     my %other_default;
2759     # The default that applies to everything left over.
2760     main::set_access('other_default', \%other_default, 'r');
2761
2762
2763     sub new {
2764         # The constructor is called with default => eval pairs, terminated by
2765         # the left-over default. e.g.
2766         # Multi_Default->new(
2767         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2768         #               -  0x200D',
2769         #        'R' => 'some other expression that evaluates to code points',
2770         #        .
2771         #        .
2772         #        .
2773         #        'U'));
2774
2775         my $class = shift;
2776
2777         my $self = bless \do{my $anonymous_scalar}, $class;
2778         my $addr = do { no overloading; pack 'J', $self; };
2779
2780         while (@_ > 1) {
2781             my $default = shift;
2782             my $eval = shift;
2783             $class_defaults{$addr}->{$default} = $eval;
2784         }
2785
2786         $other_default{$addr} = shift;
2787
2788         return $self;
2789     }
2790
2791     sub get_next_defaults {
2792         # Iterates and returns the next class of defaults.
2793         my $self = shift;
2794         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2795
2796         my $addr = do { no overloading; pack 'J', $self; };
2797
2798         return each %{$class_defaults{$addr}};
2799     }
2800 }
2801
2802 package Alias;
2803
2804 # An alias is one of the names that a table goes by.  This class defines them
2805 # including some attributes.  Everything is currently setup in the
2806 # constructor.
2807
2808
2809 {   # Closure
2810
2811     main::setup_package();
2812
2813     my %name;
2814     main::set_access('name', \%name, 'r');
2815
2816     my %loose_match;
2817     # Should this name match loosely or not.
2818     main::set_access('loose_match', \%loose_match, 'r');
2819
2820     my %make_re_pod_entry;
2821     # Some aliases should not get their own entries in the re section of the
2822     # pod, because they are covered by a wild-card, and some we want to
2823     # discourage use of.  Binary
2824     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2825
2826     my %ucd;
2827     # Is this documented to be accessible via Unicode::UCD
2828     main::set_access('ucd', \%ucd, 'r', 's');
2829
2830     my %status;
2831     # Aliases have a status, like deprecated, or even suppressed (which means
2832     # they don't appear in documentation).  Enum
2833     main::set_access('status', \%status, 'r');
2834
2835     my %ok_as_filename;
2836     # Similarly, some aliases should not be considered as usable ones for
2837     # external use, such as file names, or we don't want documentation to
2838     # recommend them.  Boolean
2839     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2840
2841     sub new {
2842         my $class = shift;
2843
2844         my $self = bless \do { my $anonymous_scalar }, $class;
2845         my $addr = do { no overloading; pack 'J', $self; };
2846
2847         $name{$addr} = shift;
2848         $loose_match{$addr} = shift;
2849         $make_re_pod_entry{$addr} = shift;
2850         $ok_as_filename{$addr} = shift;
2851         $status{$addr} = shift;
2852         $ucd{$addr} = shift;
2853
2854         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2855
2856         # Null names are never ok externally
2857         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2858
2859         return $self;
2860     }
2861 }
2862
2863 package Range;
2864
2865 # A range is the basic unit for storing code points, and is described in the
2866 # comments at the beginning of the program.  Each range has a starting code
2867 # point; an ending code point (not less than the starting one); a value
2868 # that applies to every code point in between the two end-points, inclusive;
2869 # and an enum type that applies to the value.  The type is for the user's
2870 # convenience, and has no meaning here, except that a non-zero type is
2871 # considered to not obey the normal Unicode rules for having standard forms.
2872 #
2873 # The same structure is used for both map and match tables, even though in the
2874 # latter, the value (and hence type) is irrelevant and could be used as a
2875 # comment.  In map tables, the value is what all the code points in the range
2876 # map to.  Type 0 values have the standardized version of the value stored as
2877 # well, so as to not have to recalculate it a lot.
2878
2879 sub trace { return main::trace(@_); }
2880
2881 {   # Closure
2882
2883     main::setup_package();
2884
2885     my %start;
2886     main::set_access('start', \%start, 'r', 's');
2887
2888     my %end;
2889     main::set_access('end', \%end, 'r', 's');
2890
2891     my %value;
2892     main::set_access('value', \%value, 'r');
2893
2894     my %type;
2895     main::set_access('type', \%type, 'r');
2896
2897     my %standard_form;
2898     # The value in internal standard form.  Defined only if the type is 0.
2899     main::set_access('standard_form', \%standard_form);
2900
2901     # Note that if these fields change, the dump() method should as well
2902
2903     sub new {
2904         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2905         my $class = shift;
2906
2907         my $self = bless \do { my $anonymous_scalar }, $class;
2908         my $addr = do { no overloading; pack 'J', $self; };
2909
2910         $start{$addr} = shift;
2911         $end{$addr} = shift;
2912
2913         my %args = @_;
2914
2915         my $value = delete $args{'Value'};  # Can be 0
2916         $value = "" unless defined $value;
2917         $value{$addr} = $value;
2918
2919         $type{$addr} = delete $args{'Type'} || 0;
2920
2921         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2922
2923         return $self;
2924     }
2925
2926     use overload
2927         fallback => 0,
2928         qw("") => "_operator_stringify",
2929         "." => \&main::_operator_dot,
2930         ".=" => \&main::_operator_dot_equal,
2931     ;
2932
2933     sub _operator_stringify {
2934         my $self = shift;
2935         my $addr = do { no overloading; pack 'J', $self; };
2936
2937         # Output it like '0041..0065 (value)'
2938         my $return = sprintf("%04X", $start{$addr})
2939                         .  '..'
2940                         . sprintf("%04X", $end{$addr});
2941         my $value = $value{$addr};
2942         my $type = $type{$addr};
2943         $return .= ' (';
2944         $return .= "$value";
2945         $return .= ", Type=$type" if $type != 0;
2946         $return .= ')';
2947
2948         return $return;
2949     }
2950
2951     sub standard_form {
2952         # Calculate the standard form only if needed, and cache the result.
2953         # The standard form is the value itself if the type is special.
2954         # This represents a considerable CPU and memory saving - at the time
2955         # of writing there are 368676 non-special objects, but the standard
2956         # form is only requested for 22047 of them - ie about 6%.
2957
2958         my $self = shift;
2959         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2960
2961         my $addr = do { no overloading; pack 'J', $self; };
2962
2963         return $standard_form{$addr} if defined $standard_form{$addr};
2964
2965         my $value = $value{$addr};
2966         return $value if $type{$addr};
2967         return $standard_form{$addr} = main::standardize($value);
2968     }
2969
2970     sub dump {
2971         # Human, not machine readable.  For machine readable, comment out this
2972         # entire routine and let the standard one take effect.
2973         my $self = shift;
2974         my $indent = shift;
2975         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2976
2977         my $addr = do { no overloading; pack 'J', $self; };
2978
2979         my $return = $indent
2980                     . sprintf("%04X", $start{$addr})
2981                     . '..'
2982                     . sprintf("%04X", $end{$addr})
2983                     . " '$value{$addr}';";
2984         if (! defined $standard_form{$addr}) {
2985             $return .= "(type=$type{$addr})";
2986         }
2987         elsif ($standard_form{$addr} ne $value{$addr}) {
2988             $return .= "(standard '$standard_form{$addr}')";
2989         }
2990         return $return;
2991     }
2992 } # End closure
2993
2994 package _Range_List_Base;
2995
2996 # Base class for range lists.  A range list is simply an ordered list of
2997 # ranges, so that the ranges with the lowest starting numbers are first in it.
2998 #
2999 # When a new range is added that is adjacent to an existing range that has the
3000 # same value and type, it merges with it to form a larger range.
3001 #
3002 # Ranges generally do not overlap, except that there can be multiple entries
3003 # of single code point ranges.  This is because of NameAliases.txt.
3004 #
3005 # In this program, there is a standard value such that if two different
3006 # values, have the same standard value, they are considered equivalent.  This
3007 # value was chosen so that it gives correct results on Unicode data
3008
3009 # There are a number of methods to manipulate range lists, and some operators
3010 # are overloaded to handle them.
3011
3012 sub trace { return main::trace(@_); }
3013
3014 { # Closure
3015
3016     our $addr;
3017
3018     main::setup_package();
3019
3020     my %ranges;
3021     # The list of ranges
3022     main::set_access('ranges', \%ranges, 'readable_array');
3023
3024     my %max;
3025     # The highest code point in the list.  This was originally a method, but
3026     # actual measurements said it was used a lot.
3027     main::set_access('max', \%max, 'r');
3028
3029     my %each_range_iterator;
3030     # Iterator position for each_range()
3031     main::set_access('each_range_iterator', \%each_range_iterator);
3032
3033     my %owner_name_of;
3034     # Name of parent this is attached to, if any.  Solely for better error
3035     # messages.
3036     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3037
3038     my %_search_ranges_cache;
3039     # A cache of the previous result from _search_ranges(), for better
3040     # performance
3041     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3042
3043     sub new {
3044         my $class = shift;
3045         my %args = @_;
3046
3047         # Optional initialization data for the range list.
3048         my $initialize = delete $args{'Initialize'};
3049
3050         my $self;
3051
3052         # Use _union() to initialize.  _union() returns an object of this
3053         # class, which means that it will call this constructor recursively.
3054         # But it won't have this $initialize parameter so that it won't
3055         # infinitely loop on this.
3056         return _union($class, $initialize, %args) if defined $initialize;
3057
3058         $self = bless \do { my $anonymous_scalar }, $class;
3059         my $addr = do { no overloading; pack 'J', $self; };
3060
3061         # Optional parent object, only for debug info.
3062         $owner_name_of{$addr} = delete $args{'Owner'};
3063         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3064
3065         # Stringify, in case it is an object.
3066         $owner_name_of{$addr} = "$owner_name_of{$addr}";
3067
3068         # This is used only for error messages, and so a colon is added
3069         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3070
3071         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3072
3073         # Max is initialized to a negative value that isn't adjacent to 0,
3074         # for simpler tests
3075         $max{$addr} = -2;
3076
3077         $_search_ranges_cache{$addr} = 0;
3078         $ranges{$addr} = [];
3079
3080         return $self;
3081     }
3082
3083     use overload
3084         fallback => 0,
3085         qw("") => "_operator_stringify",
3086         "." => \&main::_operator_dot,
3087         ".=" => \&main::_operator_dot_equal,
3088     ;
3089
3090     sub _operator_stringify {
3091         my $self = shift;
3092         my $addr = do { no overloading; pack 'J', $self; };
3093
3094         return "Range_List attached to '$owner_name_of{$addr}'"
3095                                                 if $owner_name_of{$addr};
3096         return "anonymous Range_List " . \$self;
3097     }
3098
3099     sub _union {
3100         # Returns the union of the input code points.  It can be called as
3101         # either a constructor or a method.  If called as a method, the result
3102         # will be a new() instance of the calling object, containing the union
3103         # of that object with the other parameter's code points;  if called as
3104         # a constructor, the first parameter gives the class that the new object
3105         # should be, and the second parameter gives the code points to go into
3106         # it.
3107         # In either case, there are two parameters looked at by this routine;
3108         # any additional parameters are passed to the new() constructor.
3109         #
3110         # The code points can come in the form of some object that contains
3111         # ranges, and has a conventionally named method to access them; or
3112         # they can be an array of individual code points (as integers); or
3113         # just a single code point.
3114         #
3115         # If they are ranges, this routine doesn't make any effort to preserve
3116         # the range values and types of one input over the other.  Therefore
3117         # this base class should not allow _union to be called from other than
3118         # initialization code, so as to prevent two tables from being added
3119         # together where the range values matter.  The general form of this
3120         # routine therefore belongs in a derived class, but it was moved here
3121         # to avoid duplication of code.  The failure to overload this in this
3122         # class keeps it safe.
3123         #
3124         # It does make the effort during initialization to accept tables with
3125         # multiple values for the same code point, and to preserve the order
3126         # of these.  If there is only one input range or range set, it doesn't
3127         # sort (as it should already be sorted to the desired order), and will
3128         # accept multiple values per code point.  Otherwise it will merge
3129         # multiple values into a single one.
3130
3131         my $self;
3132         my @args;   # Arguments to pass to the constructor
3133
3134         my $class = shift;
3135
3136         # If a method call, will start the union with the object itself, and
3137         # the class of the new object will be the same as self.
3138         if (ref $class) {
3139             $self = $class;
3140             $class = ref $self;
3141             push @args, $self;
3142         }
3143
3144         # Add the other required parameter.
3145         push @args, shift;
3146         # Rest of parameters are passed on to the constructor
3147
3148         # Accumulate all records from both lists.
3149         my @records;
3150         my $input_count = 0;
3151         for my $arg (@args) {
3152             #local $to_trace = 0 if main::DEBUG;
3153             trace "argument = $arg" if main::DEBUG && $to_trace;
3154             if (! defined $arg) {
3155                 my $message = "";
3156                 if (defined $self) {
3157                     no overloading;
3158                     $message .= $owner_name_of{pack 'J', $self};
3159                 }
3160                 Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3161                 return;
3162             }
3163
3164             $arg = [ $arg ] if ! ref $arg;
3165             my $type = ref $arg;
3166             if ($type eq 'ARRAY') {
3167                 foreach my $element (@$arg) {
3168                     push @records, Range->new($element, $element);
3169                     $input_count++;
3170                 }
3171             }
3172             elsif ($arg->isa('Range')) {
3173                 push @records, $arg;
3174                 $input_count++;
3175             }
3176             elsif ($arg->can('ranges')) {
3177                 push @records, $arg->ranges;
3178                 $input_count++;
3179             }
3180             else {
3181                 my $message = "";
3182                 if (defined $self) {
3183                     no overloading;
3184                     $message .= $owner_name_of{pack 'J', $self};
3185                 }
3186                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3187                 return;
3188             }
3189         }
3190
3191         # Sort with the range containing the lowest ordinal first, but if
3192         # two ranges start at the same code point, sort with the bigger range
3193         # of the two first, because it takes fewer cycles.
3194         if ($input_count > 1) {
3195             @records = sort { ($a->start <=> $b->start)
3196                                       or
3197                                     # if b is shorter than a, b->end will be
3198                                     # less than a->end, and we want to select
3199                                     # a, so want to return -1
3200                                     ($b->end <=> $a->end)
3201                                    } @records;
3202         }
3203
3204         my $new = $class->new(@_);
3205
3206         # Fold in records so long as they add new information.
3207         for my $set (@records) {
3208             my $start = $set->start;
3209             my $end   = $set->end;
3210             my $value = $set->value;
3211             my $type  = $set->type;
3212             if ($start > $new->max) {
3213                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3214             }
3215             elsif ($end > $new->max) {
3216                 $new->_add_delete('+', $new->max +1, $end, $value,
3217                                                                 Type => $type);
3218             }
3219             elsif ($input_count == 1) {
3220                 # Here, overlaps existing range, but is from a single input,
3221                 # so preserve the multiple values from that input.
3222                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3223                                                 Replace => $MULTIPLE_AFTER);
3224             }
3225         }
3226
3227         return $new;
3228     }
3229
3230     sub range_count {        # Return the number of ranges in the range list
3231         my $self = shift;
3232         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3233
3234         no overloading;
3235         return scalar @{$ranges{pack 'J', $self}};
3236     }
3237
3238     sub min {
3239         # Returns the minimum code point currently in the range list, or if
3240         # the range list is empty, 2 beyond the max possible.  This is a
3241         # method because used so rarely, that not worth saving between calls,
3242         # and having to worry about changing it as ranges are added and
3243         # deleted.
3244
3245         my $self = shift;
3246         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3247
3248         my $addr = do { no overloading; pack 'J', $self; };
3249
3250         # If the range list is empty, return a large value that isn't adjacent
3251         # to any that could be in the range list, for simpler tests
3252         return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3253         return $ranges{$addr}->[0]->start;
3254     }
3255
3256     sub contains {
3257         # Boolean: Is argument in the range list?  If so returns $i such that:
3258         #   range[$i]->end < $codepoint <= range[$i+1]->end
3259         # which is one beyond what you want; this is so that the 0th range
3260         # doesn't return false
3261         my $self = shift;
3262         my $codepoint = shift;
3263         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3264
3265         my $i = $self->_search_ranges($codepoint);
3266         return 0 unless defined $i;
3267
3268         # The search returns $i, such that
3269         #   range[$i-1]->end < $codepoint <= range[$i]->end
3270         # So is in the table if and only iff it is at least the start position
3271         # of range $i.
3272         no overloading;
3273         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3274         return $i + 1;
3275     }
3276
3277     sub containing_range {
3278         # Returns the range object that contains the code point, undef if none
3279
3280         my $self = shift;
3281         my $codepoint = shift;
3282         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3283
3284         my $i = $self->contains($codepoint);
3285         return unless $i;
3286
3287         # contains() returns 1 beyond where we should look
3288         no overloading;
3289         return $ranges{pack 'J', $self}->[$i-1];
3290     }
3291
3292     sub value_of {
3293         # Returns the value associated with the code point, undef if none
3294
3295         my $self = shift;
3296         my $codepoint = shift;
3297         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3298
3299         my $range = $self->containing_range($codepoint);
3300         return unless defined $range;
3301
3302         return $range->value;
3303     }
3304
3305     sub type_of {
3306         # Returns the type of the range containing the code point, undef if
3307         # the code point is not in the table
3308
3309         my $self = shift;
3310         my $codepoint = shift;
3311         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3312
3313         my $range = $self->containing_range($codepoint);
3314         return unless defined $range;
3315
3316         return $range->type;
3317     }
3318
3319     sub _search_ranges {
3320         # Find the range in the list which contains a code point, or where it
3321         # should go if were to add it.  That is, it returns $i, such that:
3322         #   range[$i-1]->end < $codepoint <= range[$i]->end
3323         # Returns undef if no such $i is possible (e.g. at end of table), or
3324         # if there is an error.
3325
3326         my $self = shift;
3327         my $code_point = shift;
3328         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3329
3330         my $addr = do { no overloading; pack 'J', $self; };
3331
3332         return if $code_point > $max{$addr};
3333         my $r = $ranges{$addr};                # The current list of ranges
3334         my $range_list_size = scalar @$r;
3335         my $i;
3336
3337         use integer;        # want integer division
3338
3339         # Use the cached result as the starting guess for this one, because,
3340         # an experiment on 5.1 showed that 90% of the time the cache was the
3341         # same as the result on the next call (and 7% it was one less).
3342         $i = $_search_ranges_cache{$addr};
3343         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3344                                             # from an intervening deletion
3345         #local $to_trace = 1 if main::DEBUG;
3346         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3347         return $i if $code_point <= $r->[$i]->end
3348                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3349
3350         # Here the cache doesn't yield the correct $i.  Try adding 1.
3351         if ($i < $range_list_size - 1
3352             && $r->[$i]->end < $code_point &&
3353             $code_point <= $r->[$i+1]->end)
3354         {
3355             $i++;
3356             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3357             $_search_ranges_cache{$addr} = $i;
3358             return $i;
3359         }
3360
3361         # Here, adding 1 also didn't work.  We do a binary search to
3362         # find the correct position, starting with current $i
3363         my $lower = 0;
3364         my $upper = $range_list_size - 1;
3365         while (1) {
3366             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3367
3368             if ($code_point <= $r->[$i]->end) {
3369
3370                 # Here we have met the upper constraint.  We can quit if we
3371                 # also meet the lower one.
3372                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3373
3374                 $upper = $i;        # Still too high.
3375
3376             }
3377             else {
3378
3379                 # Here, $r[$i]->end < $code_point, so look higher up.
3380                 $lower = $i;
3381             }
3382
3383             # Split search domain in half to try again.
3384             my $temp = ($upper + $lower) / 2;
3385
3386             # No point in continuing unless $i changes for next time
3387             # in the loop.
3388             if ($temp == $i) {
3389
3390                 # We can't reach the highest element because of the averaging.
3391                 # So if one below the upper edge, force it there and try one
3392                 # more time.
3393                 if ($i == $range_list_size - 2) {
3394
3395                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3396                     $i = $range_list_size - 1;
3397
3398                     # Change $lower as well so if fails next time through,
3399                     # taking the average will yield the same $i, and we will
3400                     # quit with the error message just below.
3401                     $lower = $i;
3402                     next;
3403                 }
3404                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3405                 return;
3406             }
3407             $i = $temp;
3408         } # End of while loop
3409
3410         if (main::DEBUG && $to_trace) {
3411             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3412             trace "i=  [ $i ]", $r->[$i];
3413             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3414         }
3415
3416         # Here we have found the offset.  Cache it as a starting point for the
3417         # next call.
3418         $_search_ranges_cache{$addr} = $i;
3419         return $i;
3420     }
3421
3422     sub _add_delete {
3423         # Add, replace or delete ranges to or from a list.  The $type
3424         # parameter gives which:
3425         #   '+' => insert or replace a range, returning a list of any changed
3426         #          ranges.
3427         #   '-' => delete a range, returning a list of any deleted ranges.
3428         #
3429         # The next three parameters give respectively the start, end, and
3430         # value associated with the range.  'value' should be null unless the
3431         # operation is '+';
3432         #
3433         # The range list is kept sorted so that the range with the lowest
3434         # starting position is first in the list, and generally, adjacent
3435         # ranges with the same values are merged into a single larger one (see
3436         # exceptions below).
3437         #
3438         # There are more parameters; all are key => value pairs:
3439         #   Type    gives the type of the value.  It is only valid for '+'.
3440         #           All ranges have types; if this parameter is omitted, 0 is
3441         #           assumed.  Ranges with type 0 are assumed to obey the
3442         #           Unicode rules for casing, etc; ranges with other types are
3443         #           not.  Otherwise, the type is arbitrary, for the caller's
3444         #           convenience, and looked at only by this routine to keep
3445         #           adjacent ranges of different types from being merged into
3446         #           a single larger range, and when Replace =>
3447         #           $IF_NOT_EQUIVALENT is specified (see just below).
3448         #   Replace  determines what to do if the range list already contains
3449         #            ranges which coincide with all or portions of the input
3450         #            range.  It is only valid for '+':
3451         #       => $NO            means that the new value is not to replace
3452         #                         any existing ones, but any empty gaps of the
3453         #                         range list coinciding with the input range
3454         #                         will be filled in with the new value.
3455         #       => $UNCONDITIONALLY  means to replace the existing values with
3456         #                         this one unconditionally.  However, if the
3457         #                         new and old values are identical, the
3458         #                         replacement is skipped to save cycles
3459         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3460         #          (the default)  with this one if they are not equivalent.
3461         #                         Ranges are equivalent if their types are the
3462         #                         same, and they are the same string; or if
3463         #                         both are type 0 ranges, if their Unicode
3464         #                         standard forms are identical.  In this last
3465         #                         case, the routine chooses the more "modern"
3466         #                         one to use.  This is because some of the
3467         #                         older files are formatted with values that
3468         #                         are, for example, ALL CAPs, whereas the
3469         #                         derived files have a more modern style,
3470         #                         which looks better.  By looking for this
3471         #                         style when the pre-existing and replacement
3472         #                         standard forms are the same, we can move to
3473         #                         the modern style
3474         #       => $MULTIPLE_BEFORE means that if this range duplicates an
3475         #                         existing one, but has a different value,
3476         #                         don't replace the existing one, but insert
3477         #                         this, one so that the same range can occur
3478         #                         multiple times.  They are stored LIFO, so
3479         #                         that the final one inserted is the first one
3480         #                         returned in an ordered search of the table.
3481         #                         If this is an exact duplicate, including the
3482         #                         value, the original will be moved to be
3483         #                         first, before any other duplicate ranges
3484         #                         with different values.
3485         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3486         #                         FIFO, so that this one is inserted after all
3487         #                         others that currently exist.  If this is an
3488         #                         exact duplicate, including value, of an
3489         #                         existing range, this one is discarded
3490         #                         (leaving the existing one in its original,
3491         #                         higher priority position
3492         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3493         #
3494         # "same value" means identical for non-type-0 ranges, and it means
3495         # having the same standard forms for type-0 ranges.
3496
3497         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3498
3499         my $self = shift;
3500         my $operation = shift;   # '+' for add/replace; '-' for delete;
3501         my $start = shift;
3502         my $end   = shift;
3503         my $value = shift;
3504
3505         my %args = @_;
3506
3507         $value = "" if not defined $value;        # warning: $value can be "0"
3508
3509         my $replace = delete $args{'Replace'};
3510         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3511
3512         my $type = delete $args{'Type'};
3513         $type = 0 unless defined $type;
3514
3515         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3516
3517         my $addr = do { no overloading; pack 'J', $self; };
3518
3519         if ($operation ne '+' && $operation ne '-') {
3520             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3521             return;
3522         }
3523         unless (defined $start && defined $end) {
3524             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3525             return;
3526         }
3527         unless ($end >= $start) {
3528             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3529             return;
3530         }
3531         if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3532             Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ".  Adding it anyway");
3533         }
3534         #local $to_trace = 1 if main::DEBUG;
3535
3536         if ($operation eq '-') {
3537             if ($replace != $IF_NOT_EQUIVALENT) {
3538                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3539                 $replace = $IF_NOT_EQUIVALENT;
3540             }
3541             if ($type) {
3542                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3543                 $type = 0;
3544             }
3545             if ($value ne "") {
3546                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3547                 $value = "";
3548             }
3549         }
3550
3551         my $r = $ranges{$addr};               # The current list of ranges
3552         my $range_list_size = scalar @$r;     # And its size
3553         my $max = $max{$addr};                # The current high code point in
3554                                               # the list of ranges
3555
3556         # Do a special case requiring fewer machine cycles when the new range
3557         # starts after the current highest point.  The Unicode input data is
3558         # structured so this is common.
3559         if ($start > $max) {
3560
3561             trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3562             return if $operation eq '-'; # Deleting a non-existing range is a
3563                                          # no-op
3564
3565             # If the new range doesn't logically extend the current final one
3566             # in the range list, create a new range at the end of the range
3567             # list.  (max cleverly is initialized to a negative number not
3568             # adjacent to 0 if the range list is empty, so even adding a range
3569             # to an empty range list starting at 0 will have this 'if'
3570             # succeed.)
3571             if ($start > $max + 1        # non-adjacent means can't extend.
3572                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3573                 || @{$r}[-1]->type != $type # types differ, can't extend.
3574             ) {
3575                 push @$r, Range->new($start, $end,
3576                                      Value => $value,
3577                                      Type => $type);
3578             }
3579             else {
3580
3581                 # Here, the new range starts just after the current highest in
3582                 # the range list, and they have the same type and value.
3583                 # Extend the current range to incorporate the new one.
3584                 @{$r}[-1]->set_end($end);
3585             }
3586
3587             # This becomes the new maximum.
3588             $max{$addr} = $end;
3589
3590             return;
3591         }
3592         #local $to_trace = 0 if main::DEBUG;
3593
3594         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3595
3596         # Here, the input range isn't after the whole rest of the range list.
3597         # Most likely 'splice' will be needed.  The rest of the routine finds
3598         # the needed splice parameters, and if necessary, does the splice.
3599         # First, find the offset parameter needed by the splice function for
3600         # the input range.  Note that the input range may span multiple
3601         # existing ones, but we'll worry about that later.  For now, just find
3602         # the beginning.  If the input range is to be inserted starting in a
3603         # position not currently in the range list, it must (obviously) come
3604         # just after the range below it, and just before the range above it.
3605         # Slightly less obviously, it will occupy the position currently
3606         # occupied by the range that is to come after it.  More formally, we
3607         # are looking for the position, $i, in the array of ranges, such that:
3608         #
3609         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3610         #
3611         # (The ordered relationships within existing ranges are also shown in
3612         # the equation above).  However, if the start of the input range is
3613         # within an existing range, the splice offset should point to that
3614         # existing range's position in the list; that is $i satisfies a
3615         # somewhat different equation, namely:
3616         #
3617         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3618         #
3619         # More briefly, $start can come before or after r[$i]->start, and at
3620         # this point, we don't know which it will be.  However, these
3621         # two equations share these constraints:
3622         #
3623         #   r[$i-1]->end < $start <= r[$i]->end
3624         #
3625         # And that is good enough to find $i.
3626
3627         my $i = $self->_search_ranges($start);
3628         if (! defined $i) {
3629             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3630             return;
3631         }
3632
3633         # The search function returns $i such that:
3634         #
3635         # r[$i-1]->end < $start <= r[$i]->end
3636         #
3637         # That means that $i points to the first range in the range list
3638         # that could possibly be affected by this operation.  We still don't
3639         # know if the start of the input range is within r[$i], or if it
3640         # points to empty space between r[$i-1] and r[$i].
3641         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3642
3643         # Special case the insertion of data that is not to replace any
3644         # existing data.
3645         if ($replace == $NO) {  # If $NO, has to be operation '+'
3646             #local $to_trace = 1 if main::DEBUG;
3647             trace "Doesn't replace" if main::DEBUG && $to_trace;
3648
3649             # Here, the new range is to take effect only on those code points
3650             # that aren't already in an existing range.  This can be done by
3651             # looking through the existing range list and finding the gaps in
3652             # the ranges that this new range affects, and then calling this
3653             # function recursively on each of those gaps, leaving untouched
3654             # anything already in the list.  Gather up a list of the changed
3655             # gaps first so that changes to the internal state as new ranges
3656             # are added won't be a problem.
3657             my @gap_list;
3658
3659             # First, if the starting point of the input range is outside an
3660             # existing one, there is a gap from there to the beginning of the
3661             # existing range -- add a span to fill the part that this new
3662             # range occupies
3663             if ($start < $r->[$i]->start) {
3664                 push @gap_list, Range->new($start,
3665                                            main::min($end,
3666                                                      $r->[$i]->start - 1),
3667                                            Type => $type);
3668                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3669             }
3670
3671             # Then look through the range list for other gaps until we reach
3672             # the highest range affected by the input one.
3673             my $j;
3674             for ($j = $i+1; $j < $range_list_size; $j++) {
3675                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3676                 last if $end < $r->[$j]->start;
3677
3678                 # If there is a gap between when this range starts and the
3679                 # previous one ends, add a span to fill it.  Note that just
3680                 # because there are two ranges doesn't mean there is a
3681                 # non-zero gap between them.  It could be that they have
3682                 # different values or types
3683                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3684                     push @gap_list,
3685                         Range->new($r->[$j-1]->end + 1,
3686                                    $r->[$j]->start - 1,
3687                                    Type => $type);
3688                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3689                 }
3690             }
3691
3692             # Here, we have either found an existing range in the range list,
3693             # beyond the area affected by the input one, or we fell off the
3694             # end of the loop because the input range affects the whole rest
3695             # of the range list.  In either case, $j is 1 higher than the
3696             # highest affected range.  If $j == $i, it means that there are no
3697             # affected ranges, that the entire insertion is in the gap between
3698             # r[$i-1], and r[$i], which we already have taken care of before
3699             # the loop.
3700             # On the other hand, if there are affected ranges, it might be
3701             # that there is a gap that needs filling after the final such
3702             # range to the end of the input range
3703             if ($r->[$j-1]->end < $end) {
3704                     push @gap_list, Range->new(main::max($start,
3705                                                          $r->[$j-1]->end + 1),
3706                                                $end,
3707                                                Type => $type);
3708                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3709             }
3710
3711             # Call recursively to fill in all the gaps.
3712             foreach my $gap (@gap_list) {
3713                 $self->_add_delete($operation,
3714                                    $gap->start,
3715                                    $gap->end,
3716                                    $value,
3717                                    Type => $type);
3718             }
3719
3720             return;
3721         }
3722
3723         # Here, we have taken care of the case where $replace is $NO.
3724         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3725         # If inserting a multiple record, this is where it goes, before the
3726         # first (if any) existing one if inserting LIFO.  (If this is to go
3727         # afterwards, FIFO, we below move the pointer to there.)  These imply
3728         # an insertion, and no change to any existing ranges.  Note that $i
3729         # can be -1 if this new range doesn't actually duplicate any existing,
3730         # and comes at the beginning of the list.
3731         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3732
3733             if ($start != $end) {
3734                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3735                 return;
3736             }
3737
3738             # If the new code point is within a current range ...
3739             if ($end >= $r->[$i]->start) {
3740
3741                 # Don't add an exact duplicate, as it isn't really a multiple
3742                 my $existing_value = $r->[$i]->value;
3743                 my $existing_type = $r->[$i]->type;
3744                 return if $value eq $existing_value && $type eq $existing_type;
3745
3746                 # If the multiple value is part of an existing range, we want
3747                 # to split up that range, so that only the single code point
3748                 # is affected.  To do this, we first call ourselves
3749                 # recursively to delete that code point from the table, having
3750                 # preserved its current data above.  Then we call ourselves
3751                 # recursively again to add the new multiple, which we know by
3752                 # the test just above is different than the current code
3753                 # point's value, so it will become a range containing a single
3754                 # code point: just itself.  Finally, we add back in the
3755                 # pre-existing code point, which will again be a single code
3756                 # point range.  Because 'i' likely will have changed as a
3757                 # result of these operations, we can't just continue on, but
3758                 # do this operation recursively as well.  If we are inserting
3759                 # LIFO, the pre-existing code point needs to go after the new
3760                 # one, so use MULTIPLE_AFTER; and vice versa.
3761                 if ($r->[$i]->start != $r->[$i]->end) {
3762                     $self->_add_delete('-', $start, $end, "");
3763                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3764                     return $self->_add_delete('+',
3765                             $start, $end,
3766                             $existing_value,
3767                             Type => $existing_type,
3768                             Replace => ($replace == $MULTIPLE_BEFORE)
3769                                        ? $MULTIPLE_AFTER
3770                                        : $MULTIPLE_BEFORE);
3771                 }
3772             }
3773
3774             # If to place this new record after, move to beyond all existing
3775             # ones; but don't add this one if identical to any of them, as it
3776             # isn't really a multiple.  This leaves the original order, so
3777             # that the current request is ignored.  The reasoning is that the
3778             # previous request that wanted this record to have high priority
3779             # should have precedence.
3780             if ($replace == $MULTIPLE_AFTER) {
3781                 while ($i < @$r && $r->[$i]->start == $start) {
3782                     return if $value eq $r->[$i]->value
3783                               && $type eq $r->[$i]->type;
3784                     $i++;
3785                 }
3786             }
3787             else {
3788                 # If instead we are to place this new record before any
3789                 # existing ones, remove any identical ones that come after it.
3790                 # This changes the existing order so that the new one is
3791                 # first, as is being requested.
3792                 for (my $j = $i + 1;
3793                      $j < @$r && $r->[$j]->start == $start;
3794                      $j++)
3795                 {
3796                     if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3797                         splice @$r, $j, 1;
3798                         last;   # There should only be one instance, so no
3799                                 # need to keep looking
3800                     }
3801                 }
3802             }
3803
3804             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3805             my @return = splice @$r,
3806                                 $i,
3807                                 0,
3808                                 Range->new($start,
3809                                            $end,
3810                                            Value => $value,
3811                                            Type => $type);
3812             if (main::DEBUG && $to_trace) {
3813                 trace "After splice:";
3814                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3815                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3816                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
3817                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3818                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3819                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3820             }
3821             return @return;
3822         }
3823
3824         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
3825         # leaves delete, insert, and replace either unconditionally or if not
3826         # equivalent.  $i still points to the first potential affected range.
3827         # Now find the highest range affected, which will determine the length
3828         # parameter to splice.  (The input range can span multiple existing
3829         # ones.)  If this isn't a deletion, while we are looking through the
3830         # range list, see also if this is a replacement rather than a clean
3831         # insertion; that is if it will change the values of at least one
3832         # existing range.  Start off assuming it is an insert, until find it
3833         # isn't.
3834         my $clean_insert = $operation eq '+';
3835         my $j;        # This will point to the highest affected range
3836
3837         # For non-zero types, the standard form is the value itself;
3838         my $standard_form = ($type) ? $value : main::standardize($value);
3839
3840         for ($j = $i; $j < $range_list_size; $j++) {
3841             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3842
3843             # If find a range that it doesn't overlap into, we can stop
3844             # searching
3845             last if $end < $r->[$j]->start;
3846
3847             # Here, overlaps the range at $j.  If the values don't match,
3848             # and so far we think this is a clean insertion, it becomes a
3849             # non-clean insertion, i.e., a 'change' or 'replace' instead.
3850             if ($clean_insert) {
3851                 if ($r->[$j]->standard_form ne $standard_form) {
3852                     $clean_insert = 0;
3853                     if ($replace == $CROAK) {
3854                         main::croak("The range to add "
3855                         . sprintf("%04X", $start)
3856                         . '-'
3857                         . sprintf("%04X", $end)
3858                         . " with value '$value' overlaps an existing range $r->[$j]");
3859                     }
3860                 }
3861                 else {
3862
3863                     # Here, the two values are essentially the same.  If the
3864                     # two are actually identical, replacing wouldn't change
3865                     # anything so skip it.
3866                     my $pre_existing = $r->[$j]->value;
3867                     if ($pre_existing ne $value) {
3868
3869                         # Here the new and old standardized values are the
3870                         # same, but the non-standardized values aren't.  If
3871                         # replacing unconditionally, then replace
3872                         if( $replace == $UNCONDITIONALLY) {
3873                             $clean_insert = 0;
3874                         }
3875                         else {
3876
3877                             # Here, are replacing conditionally.  Decide to
3878                             # replace or not based on which appears to look
3879                             # the "nicest".  If one is mixed case and the
3880                             # other isn't, choose the mixed case one.
3881                             my $new_mixed = $value =~ /[A-Z]/
3882                                             && $value =~ /[a-z]/;
3883                             my $old_mixed = $pre_existing =~ /[A-Z]/
3884                                             && $pre_existing =~ /[a-z]/;
3885
3886                             if ($old_mixed != $new_mixed) {
3887                                 $clean_insert = 0 if $new_mixed;
3888                                 if (main::DEBUG && $to_trace) {
3889                                     if ($clean_insert) {
3890                                         trace "Retaining $pre_existing over $value";
3891                                     }
3892                                     else {
3893                                         trace "Replacing $pre_existing with $value";
3894                                     }
3895                                 }
3896                             }
3897                             else {
3898
3899                                 # Here casing wasn't different between the two.
3900                                 # If one has hyphens or underscores and the
3901                                 # other doesn't, choose the one with the
3902                                 # punctuation.
3903                                 my $new_punct = $value =~ /[-_]/;
3904                                 my $old_punct = $pre_existing =~ /[-_]/;
3905
3906                                 if ($old_punct != $new_punct) {
3907                                     $clean_insert = 0 if $new_punct;
3908                                     if (main::DEBUG && $to_trace) {
3909                                         if ($clean_insert) {
3910                                             trace "Retaining $pre_existing over $value";
3911                                         }
3912                                         else {
3913                                             trace "Replacing $pre_existing with $value";
3914                                         }
3915                                     }
3916                                 }   # else existing one is just as "good";
3917                                     # retain it to save cycles.
3918                             }
3919                         }
3920                     }
3921                 }
3922             }
3923         } # End of loop looking for highest affected range.
3924
3925         # Here, $j points to one beyond the highest range that this insertion
3926         # affects (hence to beyond the range list if that range is the final
3927         # one in the range list).
3928
3929         # The splice length is all the affected ranges.  Get it before
3930         # subtracting, for efficiency, so we don't have to later add 1.
3931         my $length = $j - $i;
3932
3933         $j--;        # $j now points to the highest affected range.
3934         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3935
3936         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3937         # $j points to the highest affected range.  But it can be < $i or even
3938         # -1.  These happen only if the insertion is entirely in the gap
3939         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3940         # above exited first time through with $end < $r->[$i]->start.  (And
3941         # then we subtracted one from j)  This implies also that $start <
3942         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3943         # $start, so the entire input range is in the gap.
3944         if ($j < $i) {
3945
3946             # Here the entire input range is in the gap before $i.
3947
3948             if (main::DEBUG && $to_trace) {
3949                 if ($i) {
3950                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3951                 }
3952                 else {
3953                     trace "Entire range is before $r->[$i]";
3954                 }
3955             }
3956             return if $operation ne '+'; # Deletion of a non-existent range is
3957                                          # a no-op
3958         }
3959         else {
3960
3961             # Here part of the input range is not in the gap before $i.  Thus,
3962             # there is at least one affected one, and $j points to the highest
3963             # such one.
3964
3965             # At this point, here is the situation:
3966             # This is not an insertion of a multiple, nor of tentative ($NO)
3967             # data.
3968             #   $i  points to the first element in the current range list that
3969             #            may be affected by this operation.  In fact, we know
3970             #            that the range at $i is affected because we are in
3971             #            the else branch of this 'if'
3972             #   $j  points to the highest affected range.
3973             # In other words,
3974             #   r[$i-1]->end < $start <= r[$i]->end
3975             # And:
3976             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3977             #
3978             # Also:
3979             #   $clean_insert is a boolean which is set true if and only if
3980             #        this is a "clean insertion", i.e., not a change nor a
3981             #        deletion (multiple was handled above).
3982
3983             # We now have enough information to decide if this call is a no-op
3984             # or not.  It is a no-op if this is an insertion of already
3985             # existing data.
3986
3987             if (main::DEBUG && $to_trace && $clean_insert
3988                                          && $i == $j
3989                                          && $start >= $r->[$i]->start)
3990             {
3991                     trace "no-op";
3992             }
3993             return if $clean_insert
3994                       && $i == $j # more than one affected range => not no-op
3995
3996                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3997                       # Further, $start and/or $end is >= r[$i]->start
3998                       # The test below hence guarantees that
3999                       #     r[$i]->start < $start <= $end <= r[$i]->end
4000                       # This means the input range is contained entirely in
4001                       # the one at $i, so is a no-op
4002                       && $start >= $r->[$i]->start;
4003         }
4004
4005         # Here, we know that some action will have to be taken.  We have
4006         # calculated the offset and length (though adjustments may be needed)
4007         # for the splice.  Now start constructing the replacement list.
4008         my @replacement;
4009         my $splice_start = $i;
4010
4011         my $extends_below;
4012         my $extends_above;
4013
4014         # See if should extend any adjacent ranges.
4015         if ($operation eq '-') { # Don't extend deletions
4016             $extends_below = $extends_above = 0;
4017         }
4018         else {  # Here, should extend any adjacent ranges.  See if there are
4019                 # any.
4020             $extends_below = ($i > 0
4021                             # can't extend unless adjacent
4022                             && $r->[$i-1]->end == $start -1
4023                             # can't extend unless are same standard value
4024                             && $r->[$i-1]->standard_form eq $standard_form
4025                             # can't extend unless share type
4026                             && $r->[$i-1]->type == $type);
4027             $extends_above = ($j+1 < $range_list_size
4028                             && $r->[$j+1]->start == $end +1
4029                             && $r->[$j+1]->standard_form eq $standard_form
4030                             && $r->[$j+1]->type == $type);
4031         }
4032         if ($extends_below && $extends_above) { # Adds to both
4033             $splice_start--;     # start replace at element below
4034             $length += 2;        # will replace on both sides
4035             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4036
4037             # The result will fill in any gap, replacing both sides, and
4038             # create one large range.
4039             @replacement = Range->new($r->[$i-1]->start,
4040                                       $r->[$j+1]->end,
4041                                       Value => $value,
4042                                       Type => $type);
4043         }
4044         else {
4045
4046             # Here we know that the result won't just be the conglomeration of
4047             # a new range with both its adjacent neighbors.  But it could
4048             # extend one of them.
4049
4050             if ($extends_below) {
4051
4052                 # Here the new element adds to the one below, but not to the
4053                 # one above.  If inserting, and only to that one range,  can
4054                 # just change its ending to include the new one.
4055                 if ($length == 0 && $clean_insert) {
4056                     $r->[$i-1]->set_end($end);
4057                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4058                     return;
4059                 }
4060                 else {
4061                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4062                     $splice_start--;        # start replace at element below
4063                     $length++;              # will replace the element below
4064                     $start = $r->[$i-1]->start;
4065                 }
4066             }
4067             elsif ($extends_above) {
4068
4069                 # Here the new element adds to the one above, but not below.
4070                 # Mirror the code above
4071                 if ($length == 0 && $clean_insert) {
4072                     $r->[$j+1]->set_start($start);
4073                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4074                     return;
4075                 }
4076                 else {
4077                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4078                     $length++;        # will replace the element above
4079                     $end = $r->[$j+1]->end;
4080                 }
4081             }
4082
4083             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4084
4085             # Finally, here we know there will have to be a splice.
4086             # If the change or delete affects only the highest portion of the
4087             # first affected range, the range will have to be split.  The
4088             # splice will remove the whole range, but will replace it by a new
4089             # range containing just the unaffected part.  So, in this case,
4090             # add to the replacement list just this unaffected portion.
4091             if (! $extends_below
4092                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4093             {
4094                 push @replacement,
4095                     Range->new($r->[$i]->start,
4096                                $start - 1,
4097                                Value => $r->[$i]->value,
4098                                Type => $r->[$i]->type);
4099             }
4100
4101             # In the case of an insert or change, but not a delete, we have to
4102             # put in the new stuff;  this comes next.
4103             if ($operation eq '+') {
4104                 push @replacement, Range->new($start,
4105                                               $end,
4106                                               Value => $value,
4107                                               Type => $type);
4108             }
4109
4110             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4111             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4112
4113             # And finally, if we're changing or deleting only a portion of the
4114             # highest affected range, it must be split, as the lowest one was.
4115             if (! $extends_above
4116                 && $j >= 0  # Remember that j can be -1 if before first
4117                             # current element
4118                 && $end >= $r->[$j]->start
4119                 && $end < $r->[$j]->end)
4120             {
4121                 push @replacement,
4122                     Range->new($end + 1,
4123                                $r->[$j]->end,
4124                                Value => $r->[$j]->value,
4125                                Type => $r->[$j]->type);
4126             }
4127         }
4128
4129         # And do the splice, as calculated above
4130         if (main::DEBUG && $to_trace) {
4131             trace "replacing $length element(s) at $i with ";
4132             foreach my $replacement (@replacement) {
4133                 trace "    $replacement";
4134             }
4135             trace "Before splice:";
4136             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4137             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4138             trace "i  =[", $i, "]", $r->[$i];
4139             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4140             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4141         }
4142
4143         my @return = splice @$r, $splice_start, $length, @replacement;
4144
4145         if (main::DEBUG && $to_trace) {
4146             trace "After splice:";
4147             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4148             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4149             trace "i  =[", $i, "]", $r->[$i];
4150             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4151             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4152             trace "removed ", @return if @return;
4153         }
4154
4155         # An actual deletion could have changed the maximum in the list.
4156         # There was no deletion if the splice didn't return something, but
4157         # otherwise recalculate it.  This is done too rarely to worry about
4158         # performance.
4159         if ($operation eq '-' && @return) {
4160             $max{$addr} = $r->[-1]->end;
4161         }
4162         return @return;
4163     }
4164
4165     sub reset_each_range {  # reset the iterator for each_range();
4166         my $self = shift;
4167         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4168
4169         no overloading;
4170         undef $each_range_iterator{pack 'J', $self};
4171         return;
4172     }
4173
4174     sub each_range {
4175         # Iterate over each range in a range list.  Results are undefined if
4176         # the range list is changed during the iteration.
4177
4178         my $self = shift;
4179         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4180
4181         my $addr = do { no overloading; pack 'J', $self; };
4182
4183         return if $self->is_empty;
4184
4185         $each_range_iterator{$addr} = -1
4186                                 if ! defined $each_range_iterator{$addr};
4187         $each_range_iterator{$addr}++;
4188         return $ranges{$addr}->[$each_range_iterator{$addr}]
4189                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4190         undef $each_range_iterator{$addr};
4191         return;
4192     }
4193
4194     sub count {        # Returns count of code points in range list
4195         my $self = shift;
4196         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4197
4198         my $addr = do { no overloading; pack 'J', $self; };
4199
4200         my $count = 0;
4201         foreach my $range (@{$ranges{$addr}}) {
4202             $count += $range->end - $range->start + 1;
4203         }
4204         return $count;
4205     }
4206
4207     sub delete_range {    # Delete a range
4208         my $self = shift;
4209         my $start = shift;
4210         my $end = shift;
4211
4212         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4213
4214         return $self->_add_delete('-', $start, $end, "");
4215     }
4216
4217     sub is_empty { # Returns boolean as to if a range list is empty
4218         my $self = shift;
4219         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4220
4221         no overloading;
4222         return scalar @{$ranges{pack 'J', $self}} == 0;
4223     }
4224
4225     sub hash {
4226         # Quickly returns a scalar suitable for separating tables into
4227         # buckets, i.e. it is a hash function of the contents of a table, so
4228         # there are relatively few conflicts.
4229
4230         my $self = shift;
4231         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4232
4233         my $addr = do { no overloading; pack 'J', $self; };
4234
4235         # These are quickly computable.  Return looks like 'min..max;count'
4236         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4237     }
4238 } # End closure for _Range_List_Base
4239
4240 package Range_List;
4241 use base '_Range_List_Base';
4242
4243 # A Range_List is a range list for match tables; i.e. the range values are
4244 # not significant.  Thus a number of operations can be safely added to it,
4245 # such as inversion, intersection.  Note that union is also an unsafe
4246 # operation when range values are cared about, and that method is in the base
4247 # class, not here.  But things are set up so that that method is callable only
4248 # during initialization.  Only in this derived class, is there an operation
4249 # that combines two tables.  A Range_Map can thus be used to initialize a
4250 # Range_List, and its mappings will be in the list, but are not significant to
4251 # this class.
4252
4253 sub trace { return main::trace(@_); }
4254
4255 { # Closure
4256
4257     use overload
4258         fallback => 0,
4259         '+' => sub { my $self = shift;
4260                     my $other = shift;
4261
4262                     return $self->_union($other)
4263                 },
4264         '+=' => sub { my $self = shift;
4265                     my $other = shift;
4266                     my $reversed = shift;
4267
4268                     if ($reversed) {
4269                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4270                         . ref($other)
4271                         . ' += '
4272                         . ref($self)
4273                         . "'.  undef returned.");
4274                         return;
4275                     }
4276
4277                     return $self->_union($other)
4278                 },
4279         '&' => sub { my $self = shift;
4280                     my $other = shift;
4281
4282                     return $self->_intersect($other, 0);
4283                 },
4284         '&=' => sub { my $self = shift;
4285                     my $other = shift;
4286                     my $reversed = shift;
4287
4288                     if ($reversed) {
4289                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4290                         . ref($other)
4291                         . ' &= '
4292                         . ref($self)
4293                         . "'.  undef returned.");
4294                         return;
4295                     }
4296
4297                     return $self->_intersect($other, 0);
4298                 },
4299         '~' => "_invert",
4300         '-' => "_subtract",
4301     ;
4302
4303     sub _invert {
4304         # Returns a new Range_List that gives all code points not in $self.
4305
4306         my $self = shift;
4307
4308         my $new = Range_List->new;
4309
4310         # Go through each range in the table, finding the gaps between them
4311         my $max = -1;   # Set so no gap before range beginning at 0
4312         for my $range ($self->ranges) {
4313             my $start = $range->start;
4314             my $end   = $range->end;
4315
4316             # If there is a gap before this range, the inverse will contain
4317             # that gap.
4318             if ($start > $max + 1) {
4319                 $new->add_range($max + 1, $start - 1);
4320             }
4321             $max = $end;
4322         }
4323
4324         # And finally, add the gap from the end of the table to the max
4325         # possible code point
4326         if ($max < $MAX_UNICODE_CODEPOINT) {
4327             $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4328         }
4329         return $new;
4330     }
4331
4332     sub _subtract {
4333         # Returns a new Range_List with the argument deleted from it.  The
4334         # argument can be a single code point, a range, or something that has
4335         # a range, with the _range_list() method on it returning them
4336
4337         my $self = shift;
4338         my $other = shift;
4339         my $reversed = shift;
4340         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4341
4342         if ($reversed) {
4343             Carp::my_carp_bug("Bad news.  Can't cope with '"
4344             . ref($other)
4345             . ' - '
4346             . ref($self)
4347             . "'.  undef returned.");
4348             return;
4349         }
4350
4351         my $new = Range_List->new(Initialize => $self);
4352
4353         if (! ref $other) { # Single code point
4354             $new->delete_range($other, $other);
4355         }
4356         elsif ($other->isa('Range')) {
4357             $new->delete_range($other->start, $other->end);
4358         }
4359         elsif ($other->can('_range_list')) {
4360             foreach my $range ($other->_range_list->ranges) {
4361                 $new->delete_range($range->start, $range->end);
4362             }
4363         }
4364         else {
4365             Carp::my_carp_bug("Can't cope with a "
4366                         . ref($other)
4367                         . " argument to '-'.  Subtraction ignored."
4368                         );
4369             return $self;
4370         }
4371
4372         return $new;
4373     }
4374
4375     sub _intersect {
4376         # Returns either a boolean giving whether the two inputs' range lists
4377         # intersect (overlap), or a new Range_List containing the intersection
4378         # of the two lists.  The optional final parameter being true indicates
4379         # to do the check instead of the intersection.
4380
4381         my $a_object = shift;
4382         my $b_object = shift;
4383         my $check_if_overlapping = shift;
4384         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4385         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4386
4387         if (! defined $b_object) {
4388             my $message = "";
4389             $message .= $a_object->_owner_name_of if defined $a_object;
4390             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4391             return;
4392         }
4393
4394         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4395         # Thus the intersection could be much more simply be written:
4396         #   return ~(~$a_object + ~$b_object);
4397         # But, this is slower, and when taking the inverse of a large
4398         # range_size_1 table, back when such tables were always stored that
4399         # way, it became prohibitively slow, hence the code was changed to the
4400         # below
4401
4402         if ($b_object->isa('Range')) {
4403             $b_object = Range_List->new(Initialize => $b_object,
4404                                         Owner => $a_object->_owner_name_of);
4405         }
4406         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4407
4408         my @a_ranges = $a_object->ranges;
4409         my @b_ranges = $b_object->ranges;
4410
4411         #local $to_trace = 1 if main::DEBUG;
4412         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4413
4414         # Start with the first range in each list
4415         my $a_i = 0;
4416         my $range_a = $a_ranges[$a_i];
4417         my $b_i = 0;
4418         my $range_b = $b_ranges[$b_i];
4419
4420         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4421                                                 if ! $check_if_overlapping;
4422
4423         # If either list is empty, there is no intersection and no overlap
4424         if (! defined $range_a || ! defined $range_b) {
4425             return $check_if_overlapping ? 0 : $new;
4426         }
4427         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4428
4429         # Otherwise, must calculate the intersection/overlap.  Start with the
4430         # very first code point in each list
4431         my $a = $range_a->start;
4432         my $b = $range_b->start;
4433
4434         # Loop through all the ranges of each list; in each iteration, $a and
4435         # $b are the current code points in their respective lists
4436         while (1) {
4437
4438             # If $a and $b are the same code point, ...
4439             if ($a == $b) {
4440
4441                 # it means the lists overlap.  If just checking for overlap
4442                 # know the answer now,
4443                 return 1 if $check_if_overlapping;
4444
4445                 # The intersection includes this code point plus anything else
4446                 # common to both current ranges.
4447                 my $start = $a;
4448                 my $end = main::min($range_a->end, $range_b->end);
4449                 if (! $check_if_overlapping) {
4450                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4451                     $new->add_range($start, $end);
4452                 }
4453
4454                 # Skip ahead to the end of the current intersect
4455                 $a = $b = $end;
4456
4457                 # If the current intersect ends at the end of either range (as
4458                 # it must for at least one of them), the next possible one
4459                 # will be the beginning code point in it's list's next range.
4460                 if ($a == $range_a->end) {
4461                     $range_a = $a_ranges[++$a_i];
4462                     last unless defined $range_a;
4463                     $a = $range_a->start;
4464                 }
4465                 if ($b == $range_b->end) {
4466                     $range_b = $b_ranges[++$b_i];
4467                     last unless defined $range_b;
4468                     $b = $range_b->start;
4469                 }
4470
4471                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4472             }
4473             elsif ($a < $b) {
4474
4475                 # Not equal, but if the range containing $a encompasses $b,
4476                 # change $a to be the middle of the range where it does equal
4477                 # $b, so the next iteration will get the intersection
4478                 if ($range_a->end >= $b) {
4479                     $a = $b;
4480                 }
4481                 else {
4482
4483                     # Here, the current range containing $a is entirely below
4484                     # $b.  Go try to find a range that could contain $b.
4485                     $a_i = $a_object->_search_ranges($b);
4486
4487                     # If no range found, quit.
4488                     last unless defined $a_i;
4489
4490                     # The search returns $a_i, such that
4491                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4492                     # Set $a to the beginning of this new range, and repeat.
4493                     $range_a = $a_ranges[$a_i];
4494                     $a = $range_a->start;
4495                 }
4496             }
4497             else { # Here, $b < $a.
4498
4499                 # Mirror image code to the leg just above
4500                 if ($range_b->end >= $a) {
4501                     $b = $a;
4502                 }
4503                 else {
4504                     $b_i = $b_object->_search_ranges($a);
4505                     last unless defined $b_i;
4506                     $range_b = $b_ranges[$b_i];
4507                     $b = $range_b->start;
4508                 }
4509             }
4510         } # End of looping through ranges.
4511
4512         # Intersection fully computed, or now know that there is no overlap
4513         return $check_if_overlapping ? 0 : $new;
4514     }
4515
4516     sub overlaps {
4517         # Returns boolean giving whether the two arguments overlap somewhere
4518
4519         my $self = shift;
4520         my $other = shift;
4521         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4522
4523         return $self->_intersect($other, 1);
4524     }
4525
4526     sub add_range {
4527         # Add a range to the list.
4528
4529         my $self = shift;
4530         my $start = shift;
4531         my $end = shift;
4532         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4533
4534         return $self->_add_delete('+', $start, $end, "");
4535     }
4536
4537     sub matches_identically_to {
4538         # Return a boolean as to whether or not two Range_Lists match identical
4539         # sets of code points.
4540
4541         my $self = shift;
4542         my $other = shift;
4543         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4544
4545         # These are ordered in increasing real time to figure out (at least
4546         # until a patch changes that and doesn't change this)
4547         return 0 if $self->max != $other->max;
4548         return 0 if $self->min != $other->min;
4549         return 0 if $self->range_count != $other->range_count;
4550         return 0 if $self->count != $other->count;
4551
4552         # Here they could be identical because all the tests above passed.
4553         # The loop below is somewhat simpler since we know they have the same
4554         # number of elements.  Compare range by range, until reach the end or
4555         # find something that differs.
4556         my @a_ranges = $self->ranges;
4557         my @b_ranges = $other->ranges;
4558         for my $i (0 .. @a_ranges - 1) {
4559             my $a = $a_ranges[$i];
4560             my $b = $b_ranges[$i];
4561             trace "self $a; other $b" if main::DEBUG && $to_trace;
4562             return 0 if ! defined $b
4563                         || $a->start != $b->start
4564                         || $a->end != $b->end;
4565         }
4566         return 1;
4567     }
4568
4569     sub is_code_point_usable {
4570         # This used only for making the test script.  See if the input
4571         # proposed trial code point is one that Perl will handle.  If second
4572         # parameter is 0, it won't select some code points for various
4573         # reasons, noted below.
4574
4575         my $code = shift;
4576         my $try_hard = shift;
4577         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4578
4579         return 0 if $code < 0;                # Never use a negative
4580
4581         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4582         # the character very frequently used.
4583         return $try_hard if $code == 0x0000;
4584
4585         # shun non-character code points.
4586         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4587         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4588
4589         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
4590         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4591
4592         return 1;
4593     }
4594
4595     sub get_valid_code_point {
4596         # Return a code point that's part of the range list.  Returns nothing
4597         # if the table is empty or we can't find a suitable code point.  This
4598         # used only for making the test script.
4599
4600         my $self = shift;
4601         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4602
4603         my $addr = do { no overloading; pack 'J', $self; };
4604
4605         # On first pass, don't choose less desirable code points; if no good
4606         # one is found, repeat, allowing a less desirable one to be selected.
4607         for my $try_hard (0, 1) {
4608
4609             # Look through all the ranges for a usable code point.
4610             for my $set (reverse $self->ranges) {
4611
4612                 # Try the edge cases first, starting with the end point of the
4613                 # range.
4614                 my $end = $set->end;
4615                 return $end if is_code_point_usable($end, $try_hard);
4616
4617                 # End point didn't, work.  Start at the beginning and try
4618                 # every one until find one that does work.
4619                 for my $trial ($set->start .. $end - 1) {
4620                     return $trial if is_code_point_usable($trial, $try_hard);
4621                 }
4622             }
4623         }
4624         return ();  # If none found, give up.
4625     }
4626
4627     sub get_invalid_code_point {
4628         # Return a code point that's not part of the table.  Returns nothing
4629         # if the table covers all code points or a suitable code point can't
4630         # be found.  This used only for making the test script.
4631
4632         my $self = shift;
4633         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4634
4635         # Just find a valid code point of the inverse, if any.
4636         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4637     }
4638 } # end closure for Range_List
4639
4640 package Range_Map;
4641 use base '_Range_List_Base';
4642
4643 # A Range_Map is a range list in which the range values (called maps) are
4644 # significant, and hence shouldn't be manipulated by our other code, which
4645 # could be ambiguous or lose things.  For example, in taking the union of two
4646 # lists, which share code points, but which have differing values, which one
4647 # has precedence in the union?
4648 # It turns out that these operations aren't really necessary for map tables,
4649 # and so this class was created to make sure they aren't accidentally
4650 # applied to them.
4651
4652 { # Closure
4653
4654     sub add_map {
4655         # Add a range containing a mapping value to the list
4656
4657         my $self = shift;
4658         # Rest of parameters passed on
4659
4660         return $self->_add_delete('+', @_);
4661     }
4662
4663     sub add_duplicate {
4664         # Adds entry to a range list which can duplicate an existing entry
4665
4666         my $self = shift;
4667         my $code_point = shift;
4668         my $value = shift;
4669         my %args = @_;
4670         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4671         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4672
4673         return $self->add_map($code_point, $code_point,
4674                                 $value, Replace => $replace);
4675     }
4676 } # End of closure for package Range_Map
4677
4678 package _Base_Table;
4679
4680 # A table is the basic data structure that gets written out into a file for
4681 # use by the Perl core.  This is the abstract base class implementing the
4682 # common elements from the derived ones.  A list of the methods to be
4683 # furnished by an implementing class is just after the constructor.
4684
4685 sub standardize { return main::standardize($_[0]); }
4686 sub trace { return main::trace(@_); }
4687
4688 { # Closure
4689
4690     main::setup_package();
4691
4692     my %range_list;
4693     # Object containing the ranges of the table.
4694     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4695
4696     my %full_name;
4697     # The full table name.
4698     main::set_access('full_name', \%full_name, 'r');
4699
4700     my %name;
4701     # The table name, almost always shorter
4702     main::set_access('name', \%name, 'r');
4703
4704     my %short_name;
4705     # The shortest of all the aliases for this table, with underscores removed
4706     main::set_access('short_name', \%short_name);
4707
4708     my %nominal_short_name_length;
4709     # The length of short_name before removing underscores
4710     main::set_access('nominal_short_name_length',
4711                     \%nominal_short_name_length);
4712
4713     my %complete_name;
4714     # The complete name, including property.
4715     main::set_access('complete_name', \%complete_name, 'r');
4716
4717     my %property;
4718     # Parent property this table is attached to.
4719     main::set_access('property', \%property, 'r');
4720
4721     my %aliases;
4722     # Ordered list of alias objects of the table's name.  The first ones in
4723     # the list are output first in comments
4724     main::set_access('aliases', \%aliases, 'readable_array');
4725
4726     my %comment;
4727     # A comment associated with the table for human readers of the files
4728     main::set_access('comment', \%comment, 's');
4729
4730     my %description;
4731     # A comment giving a short description of the table's meaning for human
4732     # readers of the files.
4733     main::set_access('description', \%description, 'readable_array');
4734
4735     my %note;
4736     # A comment giving a short note about the table for human readers of the
4737     # files.
4738     main::set_access('note', \%note, 'readable_array');
4739
4740     my %fate;
4741     # Enum; there are a number of possibilities for what happens to this
4742     # table: it could be normal, or suppressed, or not for external use.  See
4743     # values at definition for $SUPPRESSED.
4744     main::set_access('fate', \%fate, 'r');
4745
4746     my %find_table_from_alias;
4747     # The parent property passes this pointer to a hash which this class adds
4748     # all its aliases to, so that the parent can quickly take an alias and
4749     # find this table.
4750     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4751
4752     my %locked;
4753     # After this table is made equivalent to another one; we shouldn't go
4754     # changing the contents because that could mean it's no longer equivalent
4755     main::set_access('locked', \%locked, 'r');
4756
4757     my %file_path;
4758     # This gives the final path to the file containing the table.  Each
4759     # directory in the path is an element in the array
4760     main::set_access('file_path', \%file_path, 'readable_array');
4761
4762     my %status;
4763     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4764     main::set_access('status', \%status, 'r');
4765
4766     my %status_info;
4767     # A comment about its being obsolete, or whatever non normal status it has
4768     main::set_access('status_info', \%status_info, 'r');
4769
4770     my %caseless_equivalent;
4771     # The table this is equivalent to under /i matching, if any.
4772     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4773
4774     my %range_size_1;
4775     # Is the table to be output with each range only a single code point?
4776     # This is done to avoid breaking existing code that may have come to rely
4777     # on this behavior in previous versions of this program.)
4778     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4779
4780     my %perl_extension;
4781     # A boolean set iff this table is a Perl extension to the Unicode
4782     # standard.
4783     main::set_access('perl_extension', \%perl_extension, 'r');
4784
4785     my %output_range_counts;
4786     # A boolean set iff this table is to have comments written in the
4787     # output file that contain the number of code points in the range.
4788     # The constructor can override the global flag of the same name.
4789     main::set_access('output_range_counts', \%output_range_counts, 'r');
4790
4791     my %format;
4792     # The format of the entries of the table.  This is calculated from the
4793     # data in the table (or passed in the constructor).  This is an enum e.g.,
4794     # $STRING_FORMAT.  It is marked protected as it should not be generally
4795     # used to override calculations.
4796     main::set_access('format', \%format, 'r', 'p_s');
4797
4798     sub new {
4799         # All arguments are key => value pairs, which you can see below, most
4800         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
4801         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4802         # documented in the Alias package
4803
4804         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4805
4806         my $class = shift;
4807
4808         my $self = bless \do { my $anonymous_scalar }, $class;
4809         my $addr = do { no overloading; pack 'J', $self; };
4810
4811         my %args = @_;
4812
4813         $name{$addr} = delete $args{'Name'};
4814         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4815         $full_name{$addr} = delete $args{'Full_Name'};
4816         my $complete_name = $complete_name{$addr}
4817                           = delete $args{'Complete_Name'};
4818         $format{$addr} = delete $args{'Format'};
4819         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4820         $property{$addr} = delete $args{'_Property'};
4821         $range_list{$addr} = delete $args{'_Range_List'};
4822         $status{$addr} = delete $args{'Status'} || $NORMAL;
4823         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4824         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4825         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4826         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4827         my $ucd = delete $args{'UCD'};
4828
4829         my $description = delete $args{'Description'};
4830         my $ok_as_filename = delete $args{'OK_as_Filename'};
4831         my $loose_match = delete $args{'Fuzzy'};
4832         my $note = delete $args{'Note'};
4833         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4834         my $perl_extension = delete $args{'Perl_Extension'};
4835
4836         # Shouldn't have any left over
4837         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4838
4839         # Can't use || above because conceivably the name could be 0, and
4840         # can't use // operator in case this program gets used in Perl 5.8
4841         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4842         $output_range_counts{$addr} = $output_range_counts if
4843                                         ! defined $output_range_counts{$addr};
4844
4845         $aliases{$addr} = [ ];
4846         $comment{$addr} = [ ];
4847         $description{$addr} = [ ];
4848         $note{$addr} = [ ];
4849         $file_path{$addr} = [ ];
4850         $locked{$addr} = "";
4851
4852         push @{$description{$addr}}, $description if $description;
4853         push @{$note{$addr}}, $note if $note;
4854
4855         if ($fate{$addr} == $PLACEHOLDER) {
4856
4857             # A placeholder table doesn't get documented, is a perl extension,
4858             # and quite likely will be empty
4859             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4860             $perl_extension = 1 if ! defined $perl_extension;
4861             $ucd = 0 if ! defined $ucd;
4862             push @tables_that_may_be_empty, $complete_name{$addr};
4863             $self->add_comment(<<END);
4864 This is a placeholder because it is not in Version $string_version of Unicode,
4865 but is needed by the Perl core to work gracefully.  Because it is not in this
4866 version of Unicode, it will not be listed in $pod_file.pod
4867 END
4868         }
4869         elsif (exists $why_suppressed{$complete_name}
4870                 # Don't suppress if overridden
4871                 && ! grep { $_ eq $complete_name{$addr} }
4872                                                     @output_mapped_properties)
4873         {
4874             $fate{$addr} = $SUPPRESSED;
4875         }
4876         elsif ($fate{$addr} == $SUPPRESSED
4877                && ! exists $why_suppressed{$property{$addr}->complete_name})
4878         {
4879             Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4880             # perhaps Fate => [ $SUPPRESSED, "reason" ]
4881         }
4882
4883         # If hasn't set its status already, see if it is on one of the
4884         # lists of properties or tables that have particular statuses; if
4885         # not, is normal.  The lists are prioritized so the most serious
4886         # ones are checked first
4887         if (! $status{$addr}) {
4888             if (exists $why_deprecated{$complete_name}) {
4889                 $status{$addr} = $DEPRECATED;
4890             }
4891             elsif (exists $why_stabilized{$complete_name}) {
4892                 $status{$addr} = $STABILIZED;
4893             }
4894             elsif (exists $why_obsolete{$complete_name}) {
4895                 $status{$addr} = $OBSOLETE;
4896             }
4897
4898             # Existence above doesn't necessarily mean there is a message
4899             # associated with it.  Use the most serious message.
4900             if ($status{$addr}) {
4901                 if ($why_deprecated{$complete_name}) {
4902                     $status_info{$addr}
4903                                 = $why_deprecated{$complete_name};
4904                 }
4905                 elsif ($why_stabilized{$complete_name}) {
4906                     $status_info{$addr}
4907                                 = $why_stabilized{$complete_name};
4908                 }
4909                 elsif ($why_obsolete{$complete_name}) {
4910                     $status_info{$addr}
4911                                 = $why_obsolete{$complete_name};
4912                 }
4913             }
4914         }
4915
4916         $perl_extension{$addr} = $perl_extension || 0;
4917
4918         # Don't list a property by default that is internal only
4919         if ($fate{$addr} > $MAP_PROXIED) {
4920             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4921             $ucd = 0 if ! defined $ucd;
4922         }
4923         else {
4924             $ucd = 1 if ! defined $ucd;
4925         }
4926
4927         # By convention what typically gets printed only or first is what's
4928         # first in the list, so put the full name there for good output
4929         # clarity.  Other routines rely on the full name being first on the
4930         # list
4931         $self->add_alias($full_name{$addr},
4932                             OK_as_Filename => $ok_as_filename,
4933                             Fuzzy => $loose_match,
4934                             Re_Pod_Entry => $make_re_pod_entry,
4935                             Status => $status{$addr},
4936                             UCD => $ucd,
4937                             );
4938
4939         # Then comes the other name, if meaningfully different.
4940         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4941             $self->add_alias($name{$addr},
4942                             OK_as_Filename => $ok_as_filename,
4943                             Fuzzy => $loose_match,
4944                             Re_Pod_Entry => $make_re_pod_entry,
4945                             Status => $status{$addr},
4946                             UCD => $ucd,
4947                             );
4948         }
4949
4950         return $self;
4951     }
4952
4953     # Here are the methods that are required to be defined by any derived
4954     # class
4955     for my $sub (qw(
4956                     handle_special_range
4957                     append_to_body
4958                     pre_body
4959                 ))
4960                 # write() knows how to write out normal ranges, but it calls
4961                 # handle_special_range() when it encounters a non-normal one.
4962                 # append_to_body() is called by it after it has handled all
4963                 # ranges to add anything after the main portion of the table.
4964                 # And finally, pre_body() is called after all this to build up
4965                 # anything that should appear before the main portion of the
4966                 # table.  Doing it this way allows things in the middle to
4967                 # affect what should appear before the main portion of the
4968                 # table.
4969     {
4970         no strict "refs";
4971         *$sub = sub {
4972             Carp::my_carp_bug( __LINE__
4973                               . ": Must create method '$sub()' for "
4974                               . ref shift);
4975             return;
4976         }
4977     }
4978
4979     use overload
4980         fallback => 0,
4981         "." => \&main::_operator_dot,
4982         ".=" => \&main::_operator_dot_equal,
4983         '!=' => \&main::_operator_not_equal,
4984         '==' => \&main::_operator_equal,
4985     ;
4986
4987     sub ranges {
4988         # Returns the array of ranges associated with this table.
4989
4990         no overloading;
4991         return $range_list{pack 'J', shift}->ranges;
4992     }
4993
4994     sub add_alias {
4995         # Add a synonym for this table.
4996
4997         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4998
4999         my $self = shift;
5000         my $name = shift;       # The name to add.
5001         my $pointer = shift;    # What the alias hash should point to.  For
5002                                 # map tables, this is the parent property;
5003                                 # for match tables, it is the table itself.
5004
5005         my %args = @_;
5006         my $loose_match = delete $args{'Fuzzy'};
5007
5008         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5009         $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5010
5011         my $ok_as_filename = delete $args{'OK_as_Filename'};
5012         $ok_as_filename = 1 unless defined $ok_as_filename;
5013
5014         my $status = delete $args{'Status'};
5015         $status = $NORMAL unless defined $status;
5016
5017         # An internal name does not get documented, unless overridden by the
5018         # input.
5019         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5020
5021         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5022
5023         # Capitalize the first letter of the alias unless it is one of the CJK
5024         # ones which specifically begins with a lower 'k'.  Do this because
5025         # Unicode has varied whether they capitalize first letters or not, and
5026         # have later changed their minds and capitalized them, but not the
5027         # other way around.  So do it always and avoid changes from release to
5028         # release
5029         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5030
5031         my $addr = do { no overloading; pack 'J', $self; };
5032
5033         # Figure out if should be loosely matched if not already specified.
5034         if (! defined $loose_match) {
5035
5036             # Is a loose_match if isn't null, and doesn't begin with an
5037             # underscore and isn't just a number
5038             if ($name ne ""
5039                 && substr($name, 0, 1) ne '_'
5040                 && $name !~ qr{^[0-9_.+-/]+$})
5041             {
5042                 $loose_match = 1;
5043             }
5044             else {
5045                 $loose_match = 0;
5046             }
5047         }
5048
5049         # If this alias has already been defined, do nothing.
5050         return if defined $find_table_from_alias{$addr}->{$name};
5051
5052         # That includes if it is standardly equivalent to an existing alias,
5053         # in which case, add this name to the list, so won't have to search
5054         # for it again.
5055         my $standard_name = main::standardize($name);
5056         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5057             $find_table_from_alias{$addr}->{$name}
5058                         = $find_table_from_alias{$addr}->{$standard_name};
5059             return;
5060         }
5061
5062         # Set the index hash for this alias for future quick reference.
5063         $find_table_from_alias{$addr}->{$name} = $pointer;
5064         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5065         local $to_trace = 0 if main::DEBUG;
5066         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5067         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5068
5069
5070         # Put the new alias at the end of the list of aliases unless the final
5071         # element begins with an underscore (meaning it is for internal perl
5072         # use) or is all numeric, in which case, put the new one before that
5073         # one.  This floats any all-numeric or underscore-beginning aliases to
5074         # the end.  This is done so that they are listed last in output lists,
5075         # to encourage the user to use a better name (either more descriptive
5076         # or not an internal-only one) instead.  This ordering is relied on
5077         # implicitly elsewhere in this program, like in short_name()
5078         my $list = $aliases{$addr};
5079         my $insert_position = (@$list == 0
5080                                 || (substr($list->[-1]->name, 0, 1) ne '_'
5081                                     && $list->[-1]->name =~ /\D/))
5082                             ? @$list
5083                             : @$list - 1;
5084         splice @$list,
5085                 $insert_position,
5086                 0,
5087                 Alias->new($name, $loose_match, $make_re_pod_entry,
5088                                                 $ok_as_filename, $status, $ucd);
5089
5090         # This name may be shorter than any existing ones, so clear the cache
5091         # of the shortest, so will have to be recalculated.
5092         no overloading;
5093         undef $short_name{pack 'J', $self};
5094         return;
5095     }
5096
5097     sub short_name {
5098         # Returns a name suitable for use as the base part of a file name.
5099         # That is, shorter wins.  It can return undef if there is no suitable
5100         # name.  The name has all non-essential underscores removed.
5101
5102         # The optional second parameter is a reference to a scalar in which
5103         # this routine will store the length the returned name had before the
5104         # underscores were removed, or undef if the return is undef.
5105
5106         # The shortest name can change if new aliases are added.  So using
5107         # this should be deferred until after all these are added.  The code
5108         # that does that should clear this one's cache.
5109         # Any name with alphabetics is preferred over an all numeric one, even
5110         # if longer.
5111
5112         my $self = shift;
5113         my $nominal_length_ptr = shift;
5114         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5115
5116         my $addr = do { no overloading; pack 'J', $self; };
5117
5118         # For efficiency, don't recalculate, but this means that adding new
5119         # aliases could change what the shortest is, so the code that does
5120         # that needs to undef this.
5121         if (defined $short_name{$addr}) {
5122             if ($nominal_length_ptr) {
5123                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5124             }
5125             return $short_name{$addr};
5126         }
5127
5128         # Look at each alias
5129         foreach my $alias ($self->aliases()) {
5130
5131             # Don't use an alias that isn't ok to use for an external name.
5132             next if ! $alias->ok_as_filename;
5133
5134             my $name = main::Standardize($alias->name);
5135             trace $self, $name if main::DEBUG && $to_trace;
5136
5137             # Take the first one, or a shorter one that isn't numeric.  This
5138             # relies on numeric aliases always being last in the array
5139             # returned by aliases().  Any alpha one will have precedence.
5140             if (! defined $short_name{$addr}
5141                 || ($name =~ /\D/
5142                     && length($name) < length($short_name{$addr})))
5143             {
5144                 # Remove interior underscores.
5145                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5146
5147                 $nominal_short_name_length{$addr} = length $name;
5148             }
5149         }
5150
5151         # If the short name isn't a nice one, perhaps an equivalent table has
5152         # a better one.
5153         if (! defined $short_name{$addr}
5154             || $short_name{$addr} eq ""
5155             || $short_name{$addr} eq "_")
5156         {
5157             my $return;
5158             foreach my $follower ($self->children) {    # All equivalents
5159                 my $follower_name = $follower->short_name;
5160                 next unless defined $follower_name;
5161
5162                 # Anything (except undefined) is better than underscore or
5163                 # empty
5164                 if (! defined $return || $return eq "_") {
5165                     $return = $follower_name;
5166                     next;
5167                 }
5168
5169                 # If the new follower name isn't "_" and is shorter than the
5170                 # current best one, prefer the new one.
5171                 next if $follower_name eq "_";
5172                 next if length $follower_name > length $return;
5173                 $return = $follower_name;
5174             }
5175             $short_name{$addr} = $return if defined $return;
5176         }
5177
5178         # If no suitable external name return undef
5179         if (! defined $short_name{$addr}) {
5180             $$nominal_length_ptr = undef if $nominal_length_ptr;
5181             return;
5182         }
5183
5184         # Don't allow a null short name.
5185         if ($short_name{$addr} eq "") {
5186             $short_name{$addr} = '_';
5187             $nominal_short_name_length{$addr} = 1;
5188         }
5189
5190         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5191
5192         if ($nominal_length_ptr) {
5193             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5194         }
5195         return $short_name{$addr};
5196     }
5197
5198     sub external_name {
5199         # Returns the external name that this table should be known by.  This
5200         # is usually the short_name, but not if the short_name is undefined,
5201         # in which case the external_name is arbitrarily set to the
5202         # underscore.
5203
5204         my $self = shift;
5205         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5206
5207         my $short = $self->short_name;
5208         return $short if defined $short;
5209
5210         return '_';
5211     }
5212
5213     sub add_description { # Adds the parameter as a short description.
5214
5215         my $self = shift;
5216         my $description = shift;
5217         chomp $description;
5218         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5219
5220         no overloading;
5221         push @{$description{pack 'J', $self}}, $description;
5222
5223         return;
5224     }
5225
5226     sub add_note { # Adds the parameter as a short note.
5227
5228         my $self = shift;
5229         my $note = shift;
5230         chomp $note;
5231         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5232
5233         no overloading;
5234         push @{$note{pack 'J', $self}}, $note;
5235
5236         return;
5237     }
5238
5239     sub add_comment { # Adds the parameter as a comment.
5240
5241         return unless $debugging_build;
5242
5243         my $self = shift;
5244         my $comment = shift;
5245         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5246
5247         chomp $comment;
5248
5249         no overloading;
5250         push @{$comment{pack 'J', $self}}, $comment;
5251
5252         return;
5253     }
5254
5255     sub comment {
5256         # Return the current comment for this table.  If called in list
5257         # context, returns the array of comments.  In scalar, returns a string
5258         # of each element joined together with a period ending each.
5259
5260         my $self = shift;
5261         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5262
5263         my $addr = do { no overloading; pack 'J', $self; };
5264         my @list = @{$comment{$addr}};
5265         return @list if wantarray;
5266         my $return = "";
5267         foreach my $sentence (@list) {
5268             $return .= '.  ' if $return;
5269             $return .= $sentence;
5270             $return =~ s/\.$//;
5271         }
5272         $return .= '.' if $return;
5273         return $return;
5274     }
5275
5276     sub initialize {
5277         # Initialize the table with the argument which is any valid
5278         # initialization for range lists.
5279
5280         my $self = shift;
5281         my $addr = do { no overloading; pack 'J', $self; };
5282         my $initialization = shift;
5283         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5284
5285         # Replace the current range list with a new one of the same exact
5286         # type.
5287         my $class = ref $range_list{$addr};
5288         $range_list{$addr} = $class->new(Owner => $self,
5289                                         Initialize => $initialization);
5290         return;
5291
5292     }
5293
5294     sub header {
5295         # The header that is output for the table in the file it is written
5296         # in.
5297
5298         my $self = shift;
5299         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5300
5301         my $return = "";
5302         $return .= $DEVELOPMENT_ONLY if $compare_versions;
5303         $return .= $HEADER;
5304         return $return;
5305     }
5306
5307     sub write {
5308         # Write a representation of the table to its file.  It calls several
5309         # functions furnished by sub-classes of this abstract base class to
5310         # handle non-normal ranges, to add stuff before the table, and at its
5311         # end.  If the table is to be written so that adjustments are
5312         # required, this does that conversion.
5313
5314         my $self = shift;
5315         my $use_adjustments = shift; # ? output in adjusted format or not
5316         my $tab_stops = shift;       # The number of tab stops over to put any
5317                                      # comment.
5318         my $suppress_value = shift;  # Optional, if the value associated with
5319                                      # a range equals this one, don't write
5320                                      # the range
5321         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5322
5323         my $addr = do { no overloading; pack 'J', $self; };
5324
5325         # Start with the header
5326         my @HEADER = $self->header;
5327
5328         # Then the comments
5329         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5330                                                         if $comment{$addr};
5331
5332         # Things discovered processing the main body of the document may
5333         # affect what gets output before it, therefore pre_body() isn't called
5334         # until after all other processing of the table is done.
5335
5336         # The main body looks like a 'here' document.  If annotating, get rid
5337         # of the comments before passing to the caller, as some callers, such
5338         # as charnames.pm, can't cope with them.  (Outputting range counts
5339         # also introduces comments, but these don't show up in the tables that
5340         # can't cope with comments, and there aren't that many of them that
5341         # it's worth the extra real time to get rid of them).
5342         my @OUT;
5343         if ($annotate) {
5344             # Use the line below in Perls that don't have /r
5345             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5346             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5347         } else {
5348             push @OUT, "return <<'END';\n";
5349         }
5350
5351         if ($range_list{$addr}->is_empty) {
5352
5353             # This is a kludge for empty tables to silence a warning in
5354             # utf8.c, which can't really deal with empty tables, but it can
5355             # deal with a table that matches nothing, as the inverse of 'Any'
5356             # does.
5357             push @OUT, "!utf8::Any\n";
5358         }
5359         elsif ($self->name eq 'N'
5360
5361                # To save disk space and table cache space, avoid putting out
5362                # binary N tables, but instead create a file which just inverts
5363                # the Y table.  Since the file will still exist and occupy a
5364                # certain number of blocks, might as well output the whole
5365                # thing if it all will fit in one block.   The number of
5366                # ranges below is an approximate number for that.
5367                && ($self->property->type == $BINARY
5368                    || $self->property->type == $FORCED_BINARY)
5369                # && $self->property->tables == 2  Can't do this because the
5370                #        non-binary properties, like NFDQC aren't specifiable
5371                #        by the notation
5372                && $range_list{$addr}->ranges > 15
5373                && ! $annotate)  # Under --annotate, want to see everything
5374         {
5375             push @OUT, "!utf8::" . $self->property->name . "\n";
5376         }
5377         else {
5378             my $range_size_1 = $range_size_1{$addr};
5379             my $format;            # Used only in $annotate option
5380             my $include_name;      # Used only in $annotate option
5381
5382             if ($annotate) {
5383
5384                 # If annotating each code point, must print 1 per line.
5385                 # The variable could point to a subroutine, and we don't want
5386                 # to lose that fact, so only set if not set already
5387                 $range_size_1 = 1 if ! $range_size_1;
5388
5389                 $format = $self->format;
5390
5391                 # The name of the character is output only for tables that
5392                 # don't already include the name in the output.
5393                 my $property = $self->property;
5394                 $include_name =
5395                     !  ($property == $perl_charname
5396                         || $property == main::property_ref('Unicode_1_Name')
5397                         || $property == main::property_ref('Name')
5398                         || $property == main::property_ref('Name_Alias')
5399                        );
5400             }
5401
5402             # Values for previous time through the loop.  Initialize to
5403             # something that won't be adjacent to the first iteration;
5404             # only $previous_end matters for that.
5405             my $previous_start;
5406             my $previous_end = -2;
5407             my $previous_value;
5408
5409             # Values for next time through the portion of the loop that splits
5410             # the range.  0 in $next_start means there is no remaining portion
5411             # to deal with.
5412             my $next_start = 0;
5413             my $next_end;
5414             my $next_value;
5415             my $offset = 0;
5416
5417             # Output each range as part of the here document.
5418             RANGE:
5419             for my $set ($range_list{$addr}->ranges) {
5420                 if ($set->type != 0) {
5421                     $self->handle_special_range($set);
5422                     next RANGE;
5423                 }
5424                 my $start = $set->start;
5425                 my $end   = $set->end;
5426                 my $value  = $set->value;
5427
5428                 # Don't output ranges whose value is the one to suppress
5429                 next RANGE if defined $suppress_value
5430                               && $value eq $suppress_value;
5431
5432                 {   # This bare block encloses the scope where we may need to
5433                     # split a range (when outputting adjusteds), and each time
5434                     # through we handle the next portion of the original by
5435                     # ending the block with a 'redo'.   The values to use for
5436                     # that next time through are set up just below in the
5437                     # scalars whose names begin with '$next_'.
5438
5439                     if ($use_adjustments) {
5440
5441                         # When converting to use adjustments, we can handle
5442                         # only single element ranges.  Set up so that this
5443                         # time through the loop, we look at the first element,
5444                         # and the next time through, we start off with the
5445                         # remainder.  Thus each time through we look at the
5446                         # first element of the range
5447                         if ($end != $start) {
5448                             $next_start = $start + 1;
5449                             $next_end = $end;
5450                             $next_value = $value;
5451                             $end = $start;
5452                         }
5453
5454                         # The values for some of these tables are stored as
5455                         # hex strings.  Convert those to decimal
5456                         $value = hex($value)
5457                                     if $self->default_map eq $CODE_POINT
5458                                         && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5459
5460                         # If this range is adjacent to the previous one, and
5461                         # the values in each are integers that are also
5462                         # adjacent (differ by 1), then this range really
5463                         # extends the previous one that is already in element
5464                         # $OUT[-1].  So we pop that element, and pretend that
5465                         # the range starts with whatever it started with.
5466                         # $offset is incremented by 1 each time so that it
5467                         # gives the current offset from the first element in
5468                         # the accumulating range, and we keep in $value the
5469                         # value of that first element.
5470                         if ($start == $previous_end + 1
5471                             && $value =~ /^ -? \d+ $/xa
5472                             && $previous_value =~ /^ -? \d+ $/xa
5473                             && ($value == ($previous_value + ++$offset)))
5474                         {
5475                             pop @OUT;
5476                             $start = $previous_start;
5477                             $value = $previous_value;
5478                         }
5479                         else {
5480                             $offset = 0;
5481                         }
5482
5483                         # Save the current values for the next time through
5484                         # the loop.
5485                         $previous_start = $start;
5486                         $previous_end = $end;
5487                         $previous_value = $value;
5488                     }
5489
5490                     # If there is a range and doesn't need a single point range
5491                     # output
5492                     if ($start != $end && ! $range_size_1) {
5493                         push @OUT, sprintf "%04X\t%04X", $start, $end;
5494                         $OUT[-1] .= "\t$value" if $value ne "";
5495
5496                         # Add a comment with the size of the range, if
5497                         # requested.  Expand Tabs to make sure they all start
5498                         # in the same column, and then unexpand to use mostly
5499                         # tabs.
5500                         if (! $output_range_counts{$addr}) {
5501                             $OUT[-1] .= "\n";
5502                         }
5503                         else {
5504                             $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5505                             my $count = main::clarify_number($end - $start + 1);
5506                             use integer;
5507
5508                             my $width = $tab_stops * 8 - 1;
5509                             $OUT[-1] = sprintf("%-*s # [%s]\n",
5510                                                 $width,
5511                                                 $OUT[-1],
5512                                                 $count);
5513                             $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5514                         }
5515                     }
5516
5517                         # Here to output a single code point per line.
5518                         # If not to annotate, use the simple formats
5519                     elsif (! $annotate) {
5520
5521                         # Use any passed in subroutine to output.
5522                         if (ref $range_size_1 eq 'CODE') {
5523                             for my $i ($start .. $end) {
5524                                 push @OUT, &{$range_size_1}($i, $value);
5525                             }
5526                         }
5527                         else {
5528
5529                             # Here, caller is ok with default output.
5530                             for (my $i = $start; $i <= $end; $i++) {
5531                                 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5532                             }
5533                         }
5534                     }
5535                     else {
5536
5537                         # Here, wants annotation.
5538                         for (my $i = $start; $i <= $end; $i++) {
5539
5540                             # Get character information if don't have it already
5541                             main::populate_char_info($i)
5542                                                 if ! defined $viacode[$i];
5543                             my $type = $annotate_char_type[$i];
5544
5545                             # Figure out if should output the next code points
5546                             # as part of a range or not.  If this is not in an
5547                             # annotation range, then won't output as a range,
5548                             # so returns $i.  Otherwise use the end of the
5549                             # annotation range, but no further than the
5550                             # maximum possible end point of the loop.
5551                             my $range_end = main::min(
5552                                         $annotate_ranges->value_of($i) || $i,
5553                                         $end);
5554
5555                             # Use a range if it is a range, and either is one
5556                             # of the special annotation ranges, or the range
5557                             # is at most 3 long.  This last case causes the
5558                             # algorithmically named code points to be output
5559                             # individually in spans of at most 3, as they are
5560                             # the ones whose $type is > 0.
5561                             if ($range_end != $i
5562                                 && ( $type < 0 || $range_end - $i > 2))
5563                             {
5564                                 # Here is to output a range.  We don't allow a
5565                                 # caller-specified output format--just use the
5566                                 # standard one.
5567                                 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5568                                                                 $range_end,
5569                                                                 $value;
5570                                 my $range_name = $viacode[$i];
5571
5572                                 # For the code points which end in their hex
5573                                 # value, we eliminate that from the output
5574                                 # annotation, and capitalize only the first
5575                                 # letter of each word.
5576                                 if ($type == $CP_IN_NAME) {
5577                                     my $hex = sprintf "%04X", $i;
5578                                     $range_name =~ s/-$hex$//;
5579                                     my @words = split " ", $range_name;
5580                                     for my $word (@words) {
5581                                         $word =
5582                                           ucfirst(lc($word)) if $word ne 'CJK';
5583                                     }
5584                                     $range_name = join " ", @words;
5585                                 }
5586                                 elsif ($type == $HANGUL_SYLLABLE) {
5587                                     $range_name = "Hangul Syllable";
5588                                 }
5589
5590                                 $OUT[-1] .= " $range_name" if $range_name;
5591
5592                                 # Include the number of code points in the
5593                                 # range
5594                                 my $count =
5595                                     main::clarify_number($range_end - $i + 1);
5596                                 $OUT[-1] .= " [$count]\n";
5597
5598                                 # Skip to the end of the range
5599                                 $i = $range_end;
5600                             }
5601                             else { # Not in a range.
5602                                 my $comment = "";
5603
5604                                 # When outputting the names of each character,
5605                                 # use the character itself if printable
5606                                 $comment .= "'" . chr($i) . "' "
5607                                                             if $printable[$i];
5608
5609                                 # To make it more readable, use a minimum
5610                                 # indentation
5611                                 my $comment_indent;
5612
5613                                 # Determine the annotation
5614                                 if ($format eq $DECOMP_STRING_FORMAT) {
5615
5616                                     # This is very specialized, with the type
5617                                     # of decomposition beginning the line
5618                                     # enclosed in <...>, and the code points
5619                                     # that the code point decomposes to
5620                                     # separated by blanks.  Create two
5621                                     # strings, one of the printable
5622                                     # characters, and one of their official
5623                                     # names.
5624                                     (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5625                                     my $tostr = "";
5626                                     my $to_name = "";
5627                                     my $to_chr = "";
5628                                     foreach my $to (split " ", $map) {
5629                                         $to = CORE::hex $to;
5630                                         $to_name .= " + " if $to_name;
5631                                         $to_chr .= chr($to);
5632                                         main::populate_char_info($to)
5633                                                     if ! defined $viacode[$to];
5634                                         $to_name .=  $viacode[$to];
5635                                     }
5636
5637                                     $comment .=
5638                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5639                                     $comment_indent = 25;   # Determined by
5640                                                             # experiment
5641                                 }
5642                                 else {
5643
5644                                     # Assume that any table that has hex
5645                                     # format is a mapping of one code point to
5646                                     # another.
5647                                     if ($format eq $HEX_FORMAT) {
5648                                         my $decimal_value = CORE::hex $value;
5649                                         main::populate_char_info($decimal_value)
5650                                         if ! defined $viacode[$decimal_value];
5651                                         $comment .= "=> '"
5652                                         . chr($decimal_value)
5653                                         . "'; " if $printable[$decimal_value];
5654                                     }
5655                                     $comment .= $viacode[$i] if $include_name
5656                                                             && $viacode[$i];
5657                                     if ($format eq $HEX_FORMAT) {
5658                                         my $decimal_value = CORE::hex $value;
5659                                         $comment .=
5660                                             " => $viacode[$decimal_value]"
5661                                                 if $viacode[$decimal_value];
5662                                     }
5663
5664                                     # If including the name, no need to
5665                                     # indent, as the name will already be way
5666                                     # across the line.
5667                                     $comment_indent = ($include_name) ? 0 : 60;
5668                                 }
5669
5670                                 # Use any passed in routine to output the base
5671                                 # part of the line.
5672                                 if (ref $range_size_1 eq 'CODE') {
5673                                     my $base_part=&{$range_size_1}($i, $value);
5674                                     chomp $base_part;
5675                                     push @OUT, $base_part;
5676                                 }
5677                                 else {
5678                                     push @OUT, sprintf "%04X\t\t%s", $i, $value;
5679                                 }
5680
5681                                 # And add the annotation.
5682                                 $OUT[-1] = sprintf "%-*s\t# %s",
5683                                                    $comment_indent,
5684                                                    $OUT[-1],
5685                                                    $comment
5686                                             if $comment;
5687                                 $OUT[-1] .= "\n";
5688                             }
5689                         }
5690                     }
5691
5692                     # If we split the range, set up so the next time through
5693                     # we get the remainder, and redo.
5694                     if ($next_start) {
5695                         $start = $next_start;
5696                         $end = $next_end;
5697                         $value = $next_value;
5698                         $next_start = 0;
5699                         redo;
5700                     }
5701                 }
5702             } # End of loop through all the table's ranges
5703         }
5704
5705         # Add anything that goes after the main body, but within the here
5706         # document,
5707         my $append_to_body = $self->append_to_body;
5708         push @OUT, $append_to_body if $append_to_body;
5709
5710         # And finish the here document.
5711         push @OUT, "END\n";
5712
5713         # Done with the main portion of the body.  Can now figure out what
5714         # should appear before it in the file.
5715         my $pre_body = $self->pre_body;
5716         push @HEADER, $pre_body, "\n" if $pre_body;
5717
5718         # All these files should have a .pl suffix added to them.
5719         my @file_with_pl = @{$file_path{$addr}};
5720         $file_with_pl[-1] .= '.pl';
5721
5722         main::write(\@file_with_pl,
5723                     $annotate,      # utf8 iff annotating
5724                     \@HEADER,
5725                     \@OUT);
5726         return;
5727     }
5728
5729     sub set_status {    # Set the table's status
5730         my $self = shift;
5731         my $status = shift; # The status enum value
5732         my $info = shift;   # Any message associated with it.
5733         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5734
5735         my $addr = do { no overloading; pack 'J', $self; };
5736
5737         $status{$addr} = $status;
5738         $status_info{$addr} = $info;
5739         return;
5740     }
5741
5742     sub set_fate {  # Set the fate of a table
5743         my $self = shift;
5744         my $fate = shift;
5745         my $reason = shift;
5746         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5747
5748         my $addr = do { no overloading; pack 'J', $self; };
5749
5750         return if $fate{$addr} == $fate;    # If no-op
5751
5752         # Can only change the ordinary fate, except if going to $MAP_PROXIED
5753         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5754
5755         $fate{$addr} = $fate;
5756
5757         # Don't document anything to do with a non-normal fated table
5758         if ($fate != $ORDINARY) {
5759             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5760             foreach my $alias ($self->aliases) {
5761                 $alias->set_ucd($put_in_pod);
5762
5763                 # MAP_PROXIED doesn't affect the match tables
5764                 next if $fate == $MAP_PROXIED;
5765                 $alias->set_make_re_pod_entry($put_in_pod);
5766             }
5767         }
5768
5769         # Save the reason for suppression for output
5770         if ($fate == $SUPPRESSED && defined $reason) {
5771             $why_suppressed{$complete_name{$addr}} = $reason;
5772         }
5773
5774         return;
5775     }
5776
5777     sub lock {
5778         # Don't allow changes to the table from now on.  This stores a stack
5779         # trace of where it was called, so that later attempts to modify it
5780         # can immediately show where it got locked.
5781
5782         my $self = shift;
5783         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5784
5785         my $addr = do { no overloading; pack 'J', $self; };
5786
5787         $locked{$addr} = "";
5788
5789         my $line = (caller(0))[2];
5790         my $i = 1;
5791
5792         # Accumulate the stack trace
5793         while (1) {
5794             my ($pkg, $file, $caller_line, $caller) = caller $i++;
5795
5796             last unless defined $caller;
5797
5798             $locked{$addr} .= "    called from $caller() at line $line\n";
5799             $line = $caller_line;
5800         }
5801         $locked{$addr} .= "    called from main at line $line\n";
5802
5803         return;
5804     }
5805
5806     sub carp_if_locked {
5807         # Return whether a table is locked or not, and, by the way, complain
5808         # if is locked
5809
5810         my $self = shift;
5811         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5812
5813         my $addr = do { no overloading; pack 'J', $self; };
5814
5815         return 0 if ! $locked{$addr};
5816         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5817         return 1;
5818     }
5819
5820     sub set_file_path { # Set the final directory path for this table
5821         my $self = shift;
5822         # Rest of parameters passed on
5823
5824         no overloading;
5825         @{$file_path{pack 'J', $self}} = @_;
5826         return
5827     }
5828
5829     # Accessors for the range list stored in this table.  First for
5830     # unconditional
5831     for my $sub (qw(
5832                     containing_range
5833                     contains
5834                     count
5835                     each_range
5836                     hash
5837                     is_empty
5838                     matches_identically_to
5839                     max
5840                     min
5841                     range_count
5842                     reset_each_range
5843                     type_of
5844                     value_of
5845                 ))
5846     {
5847         no strict "refs";
5848         *$sub = sub {
5849             use strict "refs";
5850             my $self = shift;
5851             return $self->_range_list->$sub(@_);
5852         }
5853     }
5854
5855     # Then for ones that should fail if locked
5856     for my $sub (qw(
5857                     delete_range
5858                 ))
5859     {
5860         no strict "refs";
5861         *$sub = sub {
5862             use strict "refs";
5863             my $self = shift;
5864
5865             return if $self->carp_if_locked;
5866             no overloading;
5867             return $self->_range_list->$sub(@_);
5868         }
5869     }
5870
5871 } # End closure
5872
5873 package Map_Table;
5874 use base '_Base_Table';
5875
5876 # A Map Table is a table that contains the mappings from code points to
5877 # values.  There are two weird cases:
5878 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5879 #    are written in the table's file at the end of the table nonetheless.  It
5880 #    requires specially constructed code to handle these; utf8.c can not read
5881 #    these in, so they should not go in $map_directory.  As of this writing,
5882 #    the only case that these happen is for named sequences used in
5883 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
5884 #    something else could come along that uses it.
5885 # 2) Specials are anything that doesn't fit syntactically into the body of the
5886 #    table.  The ranges for these have a map type of non-zero.  The code below
5887 #    knows about and handles each possible type.   In most cases, these are
5888 #    written as part of the header.
5889 #
5890 # A map table deliberately can't be manipulated at will unlike match tables.
5891 # This is because of the ambiguities having to do with what to do with
5892 # overlapping code points.  And there just isn't a need for those things;
5893 # what one wants to do is just query, add, replace, or delete mappings, plus
5894 # write the final result.
5895 # However, there is a method to get the list of possible ranges that aren't in
5896 # this table to use for defaulting missing code point mappings.  And,
5897 # map_add_or_replace_non_nulls() does allow one to add another table to this
5898 # one, but it is clearly very specialized, and defined that the other's
5899 # non-null values replace this one's if there is any overlap.
5900
5901 sub trace { return main::trace(@_); }
5902
5903 { # Closure
5904
5905     main::setup_package();
5906
5907     my %default_map;
5908     # Many input files omit some entries; this gives what the mapping for the
5909     # missing entries should be
5910     main::set_access('default_map', \%default_map, 'r');
5911
5912     my %anomalous_entries;
5913     # Things that go in the body of the table which don't fit the normal
5914     # scheme of things, like having a range.  Not much can be done with these
5915     # once there except to output them.  This was created to handle named
5916     # sequences.
5917     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5918     main::set_access('anomalous_entries',       # Append singular, read plural
5919                     \%anomalous_entries,
5920                     'readable_array');
5921
5922     my %to_output_map;
5923     # Enum as to whether or not to write out this map table, and how:
5924     #   0               don't output
5925     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
5926     #                   it should not be removed nor its format changed.  This
5927     #                   is done for those files that have traditionally been
5928     #                   output.
5929     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
5930     #                   with this file
5931     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
5932     #                   outputting the actual mappings as-is, we adjust things
5933     #                   to create a much more compact table. Only those few
5934     #                   tables where the mapping is convertible at least to an
5935     #                   integer and compacting makes a big difference should
5936     #                   have this.  Hence, the default is to not do this
5937     #                   unless the table's default mapping is to $CODE_POINT,
5938     #                   and the range size is not 1.
5939     main::set_access('to_output_map', \%to_output_map, 's');
5940
5941     sub new {
5942         my $class = shift;
5943         my $name = shift;
5944
5945         my %args = @_;
5946
5947         # Optional initialization data for the table.
5948         my $initialize = delete $args{'Initialize'};
5949
5950         my $default_map = delete $args{'Default_Map'};
5951         my $property = delete $args{'_Property'};
5952         my $full_name = delete $args{'Full_Name'};
5953         my $to_output_map = delete $args{'To_Output_Map'};
5954
5955         # Rest of parameters passed on
5956
5957         my $range_list = Range_Map->new(Owner => $property);
5958
5959         my $self = $class->SUPER::new(
5960                                     Name => $name,
5961                                     Complete_Name =>  $full_name,
5962                                     Full_Name => $full_name,
5963                                     _Property => $property,
5964                                     _Range_List => $range_list,
5965                                     %args);
5966
5967         my $addr = do { no overloading; pack 'J', $self; };
5968
5969         $anomalous_entries{$addr} = [];
5970         $default_map{$addr} = $default_map;
5971         $to_output_map{$addr} = $to_output_map;
5972
5973         $self->initialize($initialize) if defined $initialize;
5974
5975         return $self;
5976     }
5977
5978     use overload
5979         fallback => 0,
5980         qw("") => "_operator_stringify",
5981     ;
5982
5983     sub _operator_stringify {
5984         my $self = shift;
5985
5986         my $name = $self->property->full_name;
5987         $name = '""' if $name eq "";
5988         return "Map table for Property '$name'";
5989     }
5990
5991     sub add_alias {
5992         # Add a synonym for this table (which means the property itself)
5993         my $self = shift;
5994         my $name = shift;
5995         # Rest of parameters passed on.
5996
5997         $self->SUPER::add_alias($name, $self->property, @_);
5998         return;
5999     }
6000
6001     sub add_map {
6002         # Add a range of code points to the list of specially-handled code
6003         # points.  $MULTI_CP is assumed if the type of special is not passed
6004         # in.
6005
6006         my $self = shift;
6007         my $lower = shift;
6008         my $upper = shift;
6009         my $string = shift;
6010         my %args = @_;
6011
6012         my $type = delete $args{'Type'} || 0;
6013         # Rest of parameters passed on
6014
6015         # Can't change the table if locked.
6016         return if $self->carp_if_locked;
6017
6018         my $addr = do { no overloading; pack 'J', $self; };
6019
6020         $self->_range_list->add_map($lower, $upper,
6021                                     $string,
6022                                     @_,
6023                                     Type => $type);
6024         return;
6025     }
6026
6027     sub append_to_body {
6028         # Adds to the written HERE document of the table's body any anomalous
6029         # entries in the table..
6030
6031         my $self = shift;
6032         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6033
6034         my $addr = do { no overloading; pack 'J', $self; };
6035
6036         return "" unless @{$anomalous_entries{$addr}};
6037         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6038     }
6039
6040     sub map_add_or_replace_non_nulls {
6041         # This adds the mappings in the table $other to $self.  Non-null
6042         # mappings from $other override those in $self.  It essentially merges
6043         # the two tables, with the second having priority except for null
6044         # mappings.
6045
6046         my $self = shift;
6047         my $other = shift;
6048         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6049
6050         return if $self->carp_if_locked;
6051
6052         if (! $other->isa(__PACKAGE__)) {
6053             Carp::my_carp_bug("$other should be a "
6054                         . __PACKAGE__
6055                         . ".  Not a '"
6056                         . ref($other)
6057                         . "'.  Not added;");
6058             return;
6059         }
6060
6061         my $addr = do { no overloading; pack 'J', $self; };
6062         my $other_addr = do { no overloading; pack 'J', $other; };
6063
6064         local $to_trace = 0 if main::DEBUG;
6065
6066         my $self_range_list = $self->_range_list;
6067         my $other_range_list = $other->_range_list;
6068         foreach my $range ($other_range_list->ranges) {
6069             my $value = $range->value;
6070             next if $value eq "";
6071             $self_range_list->_add_delete('+',
6072                                           $range->start,
6073                                           $range->end,
6074                                           $value,
6075                                           Type => $range->type,
6076                                           Replace => $UNCONDITIONALLY);
6077         }
6078
6079         return;
6080     }
6081
6082     sub set_default_map {
6083         # Define what code points that are missing from the input files should
6084         # map to
6085
6086         my $self = shift;
6087         my $map = shift;
6088         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6089
6090         my $addr = do { no overloading; pack 'J', $self; };
6091
6092         # Convert the input to the standard equivalent, if any (won't have any
6093         # for $STRING properties)
6094         my $standard = $self->_find_table_from_alias->{$map};
6095         $map = $standard->name if defined $standard;
6096
6097         # Warn if there already is a non-equivalent default map for this
6098         # property.  Note that a default map can be a ref, which means that
6099         # what it actually means is delayed until later in the program, and it
6100         # IS permissible to override it here without a message.
6101         my $default_map = $default_map{$addr};
6102         if (defined $default_map
6103             && ! ref($default_map)
6104             && $default_map ne $map
6105             && main::Standardize($map) ne $default_map)
6106         {
6107             my $property = $self->property;
6108             my $map_table = $property->table($map);
6109             my $default_table = $property->table($default_map);
6110             if (defined $map_table
6111                 && defined $default_table
6112                 && $map_table != $default_table)
6113             {
6114                 Carp::my_carp("Changing the default mapping for "
6115                             . $property
6116                             . " from $default_map to $map'");
6117             }
6118         }
6119
6120         $default_map{$addr} = $map;
6121
6122         # Don't also create any missing table for this map at this point,
6123         # because if we did, it could get done before the main table add is
6124         # done for PropValueAliases.txt; instead the caller will have to make
6125         # sure it exists, if desired.
6126         return;
6127     }
6128
6129     sub to_output_map {
6130         # Returns boolean: should we write this map table?
6131
6132         my $self = shift;
6133         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6134
6135         my $addr = do { no overloading; pack 'J', $self; };
6136
6137         # If overridden, use that
6138         return $to_output_map{$addr} if defined $to_output_map{$addr};
6139
6140         my $full_name = $self->full_name;
6141         return $global_to_output_map{$full_name}
6142                                 if defined $global_to_output_map{$full_name};
6143
6144         # If table says to output, do so; if says to suppress it, do so.
6145         my $fate = $self->fate;
6146         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6147         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6148         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6149
6150         my $type = $self->property->type;
6151
6152         # Don't want to output binary map tables even for debugging.
6153         return 0 if $type == $BINARY;
6154
6155         # But do want to output string ones.  All the ones that remain to
6156         # be dealt with (i.e. which haven't explicitly been set to external)
6157         # are for internal Perl use only.  The default for those that map to
6158         # $CODE_POINT and haven't been restricted to a single element range
6159         # is to use the adjusted form.
6160         if ($type == $STRING) {
6161             return $INTERNAL_MAP if $self->range_size_1
6162                                     || $default_map{$addr} ne $CODE_POINT;
6163             return $OUTPUT_ADJUSTED;
6164         }
6165
6166         # Otherwise is an $ENUM, do output it, for Perl's purposes
6167         return $INTERNAL_MAP;
6168     }
6169
6170     sub inverse_list {
6171         # Returns a Range_List that is gaps of the current table.  That is,
6172         # the inversion
6173
6174         my $self = shift;
6175         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6176
6177         my $current = Range_List->new(Initialize => $self->_range_list,
6178                                 Owner => $self->property);
6179         return ~ $current;
6180     }
6181
6182     sub header {
6183         my $self = shift;
6184         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6185
6186         my $return = $self->SUPER::header();
6187
6188         if ($self->to_output_map >= $INTERNAL_MAP) {
6189             $return .= $INTERNAL_ONLY_HEADER;
6190         }
6191         else {
6192             my $property_name = $self->property->full_name =~ s/Legacy_//r;
6193             $return .= <<END;
6194
6195 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
6196
6197 # This file is for internal use by core Perl only.  It is retained for
6198 # backwards compatibility with applications that may have come to rely on it,
6199 # but its format and even its name or existence are subject to change without
6200 # notice in a future Perl version.  Don't use it directly.  Instead, its
6201 # contents are now retrievable through a stable API in the Unicode::UCD
6202 # module: Unicode::UCD::prop_invmap('$property_name').
6203 END
6204         }
6205         return $return;
6206     }
6207
6208     sub set_final_comment {
6209         # Just before output, create the comment that heads the file
6210         # containing this table.
6211
6212         return unless $debugging_build;
6213
6214         my $self = shift;
6215         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6216
6217         # No sense generating a comment if aren't going to write it out.
6218         return if ! $self->to_output_map;
6219
6220         my $addr = do { no overloading; pack 'J', $self; };
6221
6222         my $property = $self->property;
6223
6224         # Get all the possible names for this property.  Don't use any that
6225         # aren't ok for use in a file name, etc.  This is perhaps causing that
6226         # flag to do double duty, and may have to be changed in the future to
6227         # have our own flag for just this purpose; but it works now to exclude
6228         # Perl generated synonyms from the lists for properties, where the
6229         # name is always the proper Unicode one.
6230         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6231
6232         my $count = $self->count;
6233         my $default_map = $default_map{$addr};
6234
6235         # The ranges that map to the default aren't output, so subtract that
6236         # to get those actually output.  A property with matching tables
6237         # already has the information calculated.
6238         if ($property->type != $STRING) {
6239             $count -= $property->table($default_map)->count;
6240         }
6241         elsif (defined $default_map) {
6242
6243             # But for $STRING properties, must calculate now.  Subtract the
6244             # count from each range that maps to the default.
6245             foreach my $range ($self->_range_list->ranges) {
6246                 if ($range->value eq $default_map) {
6247                     $count -= $range->end +1 - $range->start;
6248                 }
6249             }
6250
6251         }
6252
6253         # Get a  string version of $count with underscores in large numbers,
6254         # for clarity.
6255         my $string_count = main::clarify_number($count);
6256
6257         my $code_points = ($count == 1)
6258                         ? 'single code point'
6259                         : "$string_count code points";
6260
6261         my $mapping;
6262         my $these_mappings;
6263         my $are;
6264         if (@property_aliases <= 1) {
6265             $mapping = 'mapping';
6266             $these_mappings = 'this mapping';
6267             $are = 'is'
6268         }
6269         else {
6270             $mapping = 'synonymous mappings';
6271             $these_mappings = 'these mappings';
6272             $are = 'are'
6273         }
6274         my $cp;
6275         if ($count >= $MAX_UNICODE_CODEPOINTS) {
6276             $cp = "any code point in Unicode Version $string_version";
6277         }
6278         else {
6279             my $map_to;
6280             if ($default_map eq "") {
6281                 $map_to = 'the null string';
6282             }
6283             elsif ($default_map eq $CODE_POINT) {
6284                 $map_to = "itself";
6285             }
6286             else {
6287                 $map_to = "'$default_map'";
6288             }
6289             if ($count == 1) {
6290                 $cp = "the single code point";
6291             }
6292             else {
6293                 $cp = "one of the $code_points";
6294             }
6295             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6296         }
6297
6298         my $comment = "";
6299
6300         my $status = $self->status;
6301         if ($status && $status ne $PLACEHOLDER) {
6302             my $warn = uc $status_past_participles{$status};
6303             $comment .= <<END;
6304
6305 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
6306  All property or property=value combinations contained in this file are $warn.
6307  See $unicode_reference_url for what this means.
6308
6309 END
6310         }
6311         $comment .= "This file returns the $mapping:\n";
6312
6313         my $ucd_accessible_name = "";
6314         my $full_name = $self->property->full_name;
6315         for my $i (0 .. @property_aliases - 1) {
6316             my $name = $property_aliases[$i]->name;
6317             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6318             if ($property_aliases[$i]->ucd) {
6319                 if ($name eq $full_name) {
6320                     $ucd_accessible_name = $full_name;
6321                 }
6322                 elsif (! $ucd_accessible_name) {
6323                     $ucd_accessible_name = $name;
6324                 }
6325             }
6326         }
6327         $comment .= "\nwhere 'cp' is $cp.";
6328         if ($ucd_accessible_name) {
6329             $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6330         }
6331
6332         # And append any commentary already set from the actual property.
6333         $comment .= "\n\n" . $self->comment if $self->comment;
6334         if ($self->description) {
6335             $comment .= "\n\n" . join " ", $self->description;
6336         }
6337         if ($self->note) {
6338             $comment .= "\n\n" . join " ", $self->note;
6339         }
6340         $comment .= "\n";
6341
6342         if (! $self->perl_extension) {
6343             $comment .= <<END;
6344
6345 For information about what this property really means, see:
6346 $unicode_reference_url
6347 END
6348         }
6349
6350         if ($count) {        # Format differs for empty table
6351                 $comment.= "\nThe format of the ";
6352             if ($self->range_size_1) {
6353                 $comment.= <<END;
6354 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6355 is in hex; MAPPING is what CODE_POINT maps to.
6356 END
6357             }
6358             else {
6359
6360                 # There are tables which end up only having one element per
6361                 # range, but it is not worth keeping track of for making just
6362                 # this comment a little better.
6363                 $comment.= <<END;
6364 non-comment portions of the main body of lines of this file is:
6365 START\\tSTOP\\tMAPPING where START is the starting code point of the
6366 range, in hex; STOP is the ending point, or if omitted, the range has just one
6367 code point; MAPPING is what each code point between START and STOP maps to.
6368 END
6369                 if ($self->output_range_counts) {
6370                     $comment .= <<END;
6371 Numbers in comments in [brackets] indicate how many code points are in the
6372 range (omitted when the range is a single code point or if the mapping is to
6373 the null string).
6374 END
6375                 }
6376             }
6377         }
6378         $self->set_comment(main::join_lines($comment));
6379         return;
6380     }
6381
6382     my %swash_keys; # Makes sure don't duplicate swash names.
6383
6384     # The remaining variables are temporaries used while writing each table,
6385     # to output special ranges.
6386     my @multi_code_point_maps;  # Map is to more than one code point.
6387
6388     sub handle_special_range {
6389         # Called in the middle of write when it finds a range it doesn't know
6390         # how to handle.
6391
6392         my $self = shift;
6393         my $range = shift;
6394         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6395
6396         my $addr = do { no overloading; pack 'J', $self; };
6397
6398         my $type = $range->type;
6399
6400         my $low = $range->start;
6401         my $high = $range->end;
6402         my $map = $range->value;
6403
6404         # No need to output the range if it maps to the default.
6405         return if $map eq $default_map{$addr};
6406
6407         my $property = $self->property;
6408
6409         # Switch based on the map type...
6410         if ($type == $HANGUL_SYLLABLE) {
6411
6412             # These are entirely algorithmically determinable based on
6413             # some constants furnished by Unicode; for now, just set a
6414             # flag to indicate that have them.  After everything is figured
6415             # out, we will output the code that does the algorithm.  (Don't
6416             # output them if not needed because we are suppressing this
6417             # property.)
6418             $has_hangul_syllables = 1 if $property->to_output_map;
6419         }
6420         elsif ($type == $CP_IN_NAME) {
6421
6422             # Code points whose name ends in their code point are also
6423             # algorithmically determinable, but need information about the map
6424             # to do so.  Both the map and its inverse are stored in data
6425             # structures output in the file.  They are stored in the mean time
6426             # in global lists The lists will be written out later into Name.pm,
6427             # which is created only if needed.  In order to prevent duplicates
6428             # in the list, only add to them for one property, should multiple
6429             # ones need them.
6430             if ($needing_code_points_ending_in_code_point == 0) {
6431                 $needing_code_points_ending_in_code_point = $property;
6432             }
6433             if ($property == $needing_code_points_ending_in_code_point) {
6434                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6435                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6436
6437                 my $squeezed = $map =~ s/[-\s]+//gr;
6438                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6439                                                                           $low;
6440                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6441                                                                          $high;
6442
6443                 push @code_points_ending_in_code_point, { low => $low,
6444                                                         high => $high,
6445                                                         name => $map
6446                                                         };
6447             }
6448         }
6449         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6450
6451             # Multi-code point maps and null string maps have an entry
6452             # for each code point in the range.  They use the same
6453             # output format.
6454             for my $code_point ($low .. $high) {
6455
6456                 # The pack() below can't cope with surrogates.  XXX This may
6457                 # no longer be true
6458                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6459                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
6460                     next;
6461                 }
6462
6463                 # Generate the hash entries for these in the form that
6464                 # utf8.c understands.
6465                 my $tostr = "";
6466                 my $to_name = "";
6467                 my $to_chr = "";
6468                 foreach my $to (split " ", $map) {
6469                     if ($to !~ /^$code_point_re$/) {
6470                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
6471                         next;
6472                     }
6473                     $tostr .= sprintf "\\x{%s}", $to;
6474                     $to = CORE::hex $to;
6475                     if ($annotate) {
6476                         $to_name .= " + " if $to_name;
6477                         $to_chr .= chr($to);
6478                         main::populate_char_info($to)
6479                                             if ! defined $viacode[$to];
6480                         $to_name .=  $viacode[$to];
6481                     }
6482                 }
6483
6484                 # I (khw) have never waded through this line to
6485                 # understand it well enough to comment it.
6486                 my $utf8 = sprintf(qq["%s" => "$tostr",],
6487                         join("", map { sprintf "\\x%02X", $_ }
6488                             unpack("U0C*", pack("U", $code_point))));
6489
6490                 # Add a comment so that a human reader can more easily
6491                 # see what's going on.
6492                 push @multi_code_point_maps,
6493                         sprintf("%-45s # U+%04X", $utf8, $code_point);
6494                 if (! $annotate) {
6495                     $multi_code_point_maps[-1] .= " => $map";
6496                 }
6497                 else {
6498                     main::populate_char_info($code_point)
6499                                     if ! defined $viacode[$code_point];
6500                     $multi_code_point_maps[-1] .= " '"
6501                         . chr($code_point)
6502                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6503                 }
6504             }
6505         }
6506         else {
6507             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
6508         }
6509
6510         return;
6511     }
6512
6513     sub pre_body {
6514         # Returns the string that should be output in the file before the main
6515         # body of this table.  It isn't called until the main body is
6516         # calculated, saving a pass.  The string includes some hash entries
6517         # identifying the format of the body, and what the single value should
6518         # be for all ranges missing from it.  It also includes any code points
6519         # which have map_types that don't go in the main table.
6520
6521         my $self = shift;
6522         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6523
6524         my $addr = do { no overloading; pack 'J', $self; };
6525
6526         my $name = $self->property->swash_name;
6527
6528         # Currently there is nothing in the pre_body unless a swash is being
6529         # generated.
6530         return unless defined $name;
6531
6532         if (defined $swash_keys{$name}) {
6533             Carp::my_carp(main::join_lines(<<END
6534 Already created a swash name '$name' for $swash_keys{$name}.  This means that
6535 the same name desired for $self shouldn't be used.  Bad News.  This must be
6536 fixed before production use, but proceeding anyway
6537 END
6538             ));
6539         }
6540         $swash_keys{$name} = "$self";
6541
6542         my $pre_body = "";
6543
6544         # Here we assume we were called after have gone through the whole
6545         # file.  If we actually generated anything for each map type, add its
6546         # respective header and trailer
6547         my $specials_name = "";
6548         if (@multi_code_point_maps) {
6549             $specials_name = "utf8::ToSpec$name";
6550             $pre_body .= <<END;
6551
6552 # Some code points require special handling because their mappings are each to
6553 # multiple code points.  These do not appear in the main body, but are defined
6554 # in the hash below.
6555
6556 # Each key is the string of N bytes that together make up the UTF-8 encoding
6557 # for the code point.  (i.e. the same as looking at the code point's UTF-8
6558 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
6559 \%$specials_name = (
6560 END
6561             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6562         }
6563
6564         my $format = $self->format;
6565
6566         my $return = "";
6567
6568         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6569         if ($output_adjusted) {
6570             if ($specials_name) {
6571                 $return .= <<END;
6572 # The mappings in the non-hash portion of this file must be modified to get the
6573 # correct values by adding the code point ordinal number to each one that is
6574 # numeric.
6575 END
6576             }
6577             else {
6578                 $return .= <<END;
6579 # The mappings must be modified to get the correct values by adding the code
6580 # point ordinal number to each one that is numeric.
6581 END
6582             }
6583         }
6584
6585         $return .= <<END;
6586
6587 # The name this swash is to be known by, with the format of the mappings in
6588 # the main body of the table, and what all code points missing from this file
6589 # map to.
6590 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6591 END
6592         if ($specials_name) {
6593             $return .= <<END;
6594 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6595 END
6596         }
6597         my $default_map = $default_map{$addr};
6598
6599         # For $CODE_POINT default maps and using adjustments, instead the default
6600         # becomes zero.
6601         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6602                 .  (($output_adjusted && $default_map eq $CODE_POINT)
6603                    ? "0"
6604                    : $default_map)
6605                 . "';";
6606
6607         if ($default_map eq $CODE_POINT) {
6608             $return .= ' # code point maps to itself';
6609         }
6610         elsif ($default_map eq "") {
6611             $return .= ' # code point maps to the null string';
6612         }
6613         $return .= "\n";
6614
6615         $return .= $pre_body;
6616
6617         return $return;
6618     }
6619
6620     sub write {
6621         # Write the table to the file.
6622
6623         my $self = shift;
6624         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6625
6626         my $addr = do { no overloading; pack 'J', $self; };
6627
6628         # Clear the temporaries
6629         undef @multi_code_point_maps;
6630
6631         # Calculate the format of the table if not already done.
6632         my $format = $self->format;
6633         my $type = $self->property->type;
6634         my $default_map = $self->default_map;
6635         if (! defined $format) {
6636             if ($type == $BINARY) {
6637
6638                 # Don't bother checking the values, because we elsewhere
6639                 # verify that a binary table has only 2 values.
6640                 $format = $BINARY_FORMAT;
6641             }
6642             else {
6643                 my @ranges = $self->_range_list->ranges;
6644
6645                 # default an empty table based on its type and default map
6646                 if (! @ranges) {
6647
6648                     # But it turns out that the only one we can say is a
6649                     # non-string (besides binary, handled above) is when the
6650                     # table is a string and the default map is to a code point
6651                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6652                         $format = $HEX_FORMAT;
6653                     }
6654                     else {
6655                         $format = $STRING_FORMAT;
6656                     }
6657                 }
6658                 else {
6659
6660                     # Start with the most restrictive format, and as we find
6661                     # something that doesn't fit with that, change to the next
6662                     # most restrictive, and so on.
6663                     $format = $DECIMAL_FORMAT;
6664                     foreach my $range (@ranges) {
6665                         next if $range->type != 0;  # Non-normal ranges don't
6666                                                     # affect the main body
6667                         my $map = $range->value;
6668                         if ($map ne $default_map) {
6669                             last if $format eq $STRING_FORMAT;  # already at
6670                                                                 # least
6671                                                                 # restrictive
6672                             $format = $INTEGER_FORMAT
6673                                                 if $format eq $DECIMAL_FORMAT
6674                                                     && $map !~ / ^ [0-9] $ /x;
6675                             $format = $FLOAT_FORMAT
6676                                             if $format eq $INTEGER_FORMAT
6677                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6678                             $format = $RATIONAL_FORMAT
6679                                 if $format eq $FLOAT_FORMAT
6680                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6681                             $format = $HEX_FORMAT
6682                                 if ($format eq $RATIONAL_FORMAT
6683                                        && $map !~
6684                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6685                                         # Assume a leading zero means hex,
6686                                         # even if all digits are 0-9
6687                                     || ($format eq $INTEGER_FORMAT
6688                                         && $map =~ /^0[0-9A-F]/);
6689                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6690                                                        && $map =~ /[^0-9A-F]/;
6691                         }
6692                     }
6693                 }
6694             }
6695         } # end of calculating format
6696
6697         if ($default_map eq $CODE_POINT
6698             && $format ne $HEX_FORMAT
6699             && ! defined $self->format)    # manual settings are always
6700                                            # considered ok
6701         {
6702             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6703         }
6704
6705         # If the output is to be adjusted, the format of the table that gets
6706         # output is actually 'a' instead of whatever it is stored internally
6707         # as.
6708         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6709         if ($output_adjusted) {
6710             $format = $ADJUST_FORMAT;
6711         }
6712
6713         $self->_set_format($format);
6714
6715         return $self->SUPER::write(
6716             $output_adjusted,
6717             ($self->property == $block)
6718                 ? 7     # block file needs more tab stops
6719                 : 3,
6720             $default_map);   # don't write defaulteds
6721     }
6722
6723     # Accessors for the underlying list that should fail if locked.
6724     for my $sub (qw(
6725                     add_duplicate
6726                 ))
6727     {
6728         no strict "refs";
6729         *$sub = sub {
6730             use strict "refs";
6731             my $self = shift;
6732
6733             return if $self->carp_if_locked;
6734             return $self->_range_list->$sub(@_);
6735         }
6736     }
6737 } # End closure for Map_Table
6738
6739 package Match_Table;
6740 use base '_Base_Table';
6741
6742 # A Match table is one which is a list of all the code points that have
6743 # the same property and property value, for use in \p{property=value}
6744 # constructs in regular expressions.  It adds very little data to the base
6745 # structure, but many methods, as these lists can be combined in many ways to
6746 # form new ones.
6747 # There are only a few concepts added:
6748 # 1) Equivalents and Relatedness.
6749 #    Two tables can match the identical code points, but have different names.
6750 #    This always happens when there is a perl single form extension
6751 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
6752 #    tables are set to be related, with the Perl extension being a child, and
6753 #    the Unicode property being the parent.
6754 #
6755 #    It may be that two tables match the identical code points and we don't
6756 #    know if they are related or not.  This happens most frequently when the
6757 #    Block and Script properties have the exact range.  But note that a
6758 #    revision to Unicode could add new code points to the script, which would
6759 #    now have to be in a different block (as the block was filled, or there
6760 #    would have been 'Unknown' script code points in it and they wouldn't have
6761 #    been identical).  So we can't rely on any two properties from Unicode
6762 #    always matching the same code points from release to release, and thus
6763 #    these tables are considered coincidentally equivalent--not related.  When
6764 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
6765 #    'leader', and the others are 'equivalents'.  This concept is useful
6766 #    to minimize the number of tables written out.  Only one file is used for
6767 #    any identical set of code points, with entries in Heavy.pl mapping all
6768 #    the involved tables to it.
6769 #
6770 #    Related tables will always be identical; we set them up to be so.  Thus
6771 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
6772 #    unrelated tables.  Relatedness makes generating the documentation easier.
6773 #
6774 # 2) Complement.
6775 #    Like equivalents, two tables may be the inverses of each other, the
6776 #    intersection between them is null, and the union is every Unicode code
6777 #    point.  The two tables that occupy a binary property are necessarily like
6778 #    this.  By specifying one table as the complement of another, we can avoid
6779 #    storing it on disk (using the other table and performing a fast
6780 #    transform), and some memory and calculations.
6781 #
6782 # 3) Conflicting.  It may be that there will eventually be name clashes, with
6783 #    the same name meaning different things.  For a while, there actually were
6784 #    conflicts, but they have so far been resolved by changing Perl's or
6785 #    Unicode's definitions to match the other, but when this code was written,
6786 #    it wasn't clear that that was what was going to happen.  (Unicode changed
6787 #    because of protests during their beta period.)  Name clashes are warned
6788 #    about during compilation, and the documentation.  The generated tables
6789 #    are sane, free of name clashes, because the code suppresses the Perl
6790 #    version.  But manual intervention to decide what the actual behavior
6791 #    should be may be required should this happen.  The introductory comments
6792 #    have more to say about this.
6793
6794 sub standardize { return main::standardize($_[0]); }
6795 sub trace { return main::trace(@_); }
6796
6797
6798 { # Closure
6799
6800     main::setup_package();
6801
6802     my %leader;
6803     # The leader table of this one; initially $self.
6804     main::set_access('leader', \%leader, 'r');
6805
6806     my %equivalents;
6807     # An array of any tables that have this one as their leader
6808     main::set_access('equivalents', \%equivalents, 'readable_array');
6809
6810     my %parent;
6811     # The parent table to this one, initially $self.  This allows us to
6812     # distinguish between equivalent tables that are related (for which this
6813     # is set to), and those which may not be, but share the same output file
6814     # because they match the exact same set of code points in the current
6815     # Unicode release.
6816     main::set_access('parent', \%parent, 'r');
6817
6818     my %children;
6819     # An array of any tables that have this one as their parent
6820     main::set_access('children', \%children, 'readable_array');
6821
6822     my %conflicting;
6823     # Array of any tables that would have the same name as this one with
6824     # a different meaning.  This is used for the generated documentation.
6825     main::set_access('conflicting', \%conflicting, 'readable_array');
6826
6827     my %matches_all;
6828     # Set in the constructor for tables that are expected to match all code
6829     # points.
6830     main::set_access('matches_all', \%matches_all, 'r');
6831
6832     my %complement;
6833     # Points to the complement that this table is expressed in terms of; 0 if
6834     # none.
6835     main::set_access('complement', \%complement, 'r');
6836
6837     sub new {
6838         my $class = shift;
6839
6840         my %args = @_;
6841
6842         # The property for which this table is a listing of property values.
6843         my $property = delete $args{'_Property'};
6844
6845         my $name = delete $args{'Name'};
6846         my $full_name = delete $args{'Full_Name'};
6847         $full_name = $name if ! defined $full_name;
6848
6849         # Optional
6850         my $initialize = delete $args{'Initialize'};
6851         my $matches_all = delete $args{'Matches_All'} || 0;
6852         my $format = delete $args{'Format'};
6853         # Rest of parameters passed on.
6854
6855         my $range_list = Range_List->new(Initialize => $initialize,
6856                                          Owner => $property);
6857
6858         my $complete = $full_name;
6859         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
6860                                               # but this helps debug if it
6861                                               # does
6862         # The complete name for a match table includes it's property in a
6863         # compound form 'property=table', except if the property is the
6864         # pseudo-property, perl, in which case it is just the single form,
6865         # 'table' (If you change the '=' must also change the ':' in lots of
6866         # places in this program that assume an equal sign)
6867         $complete = $property->full_name . "=$complete" if $property != $perl;
6868
6869         my $self = $class->SUPER::new(%args,
6870                                       Name => $name,
6871                                       Complete_Name => $complete,
6872                                       Full_Name => $full_name,
6873                                       _Property => $property,
6874                                       _Range_List => $range_list,
6875                                       Format => $EMPTY_FORMAT,
6876                                       );
6877         my $addr = do { no overloading; pack 'J', $self; };
6878
6879         $conflicting{$addr} = [ ];
6880         $equivalents{$addr} = [ ];
6881         $children{$addr} = [ ];
6882         $matches_all{$addr} = $matches_all;
6883         $leader{$addr} = $self;
6884         $parent{$addr} = $self;
6885         $complement{$addr} = 0;
6886
6887         if (defined $format && $format ne $EMPTY_FORMAT) {
6888             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
6889         }
6890
6891         return $self;
6892     }
6893
6894     # See this program's beginning comment block about overloading these.
6895     use overload
6896         fallback => 0,
6897         qw("") => "_operator_stringify",
6898         '=' => sub {
6899                     my $self = shift;
6900
6901                     return if $self->carp_if_locked;
6902                     return $self;
6903                 },
6904
6905         '+' => sub {
6906                         my $self = shift;
6907                         my $other = shift;
6908
6909                         return $self->_range_list + $other;
6910                     },
6911         '&' => sub {
6912                         my $self = shift;
6913                         my $other = shift;
6914
6915                         return $self->_range_list & $other;
6916                     },
6917         '+=' => sub {
6918                         my $self = shift;
6919                         my $other = shift;
6920                         my $reversed = shift;
6921
6922                         if ($reversed) {
6923                             Carp::my_carp_bug("Bad news.  Can't cope with '"
6924                             . ref($other)
6925                             . ' += '
6926                             . ref($self)
6927                             . "'.  undef returned.");
6928                             return;
6929                         }
6930
6931                         return if $self->carp_if_locked;
6932
6933                         my $addr = do { no overloading; pack 'J', $self; };
6934
6935                         if (ref $other) {
6936
6937                             # Change the range list of this table to be the
6938                             # union of the two.
6939                             $self->_set_range_list($self->_range_list
6940                                                     + $other);
6941                         }
6942                         else {    # $other is just a simple value
6943                             $self->add_range($other, $other);
6944                         }
6945                         return $self;
6946                     },
6947         '&=' => sub {
6948                         my $self = shift;
6949                         my $other = shift;
6950                         my $reversed = shift;
6951
6952                         if ($reversed) {
6953                             Carp::my_carp_bug("Bad news.  Can't cope with '"
6954                             . ref($other)
6955                             . ' &= '
6956                             . ref($self)
6957                             . "'.  undef returned.");
6958                             return;
6959                         }
6960
6961                         return if $self->carp_if_locked;
6962                         $self->_set_range_list($self->_range_list & $other);
6963                         return $self;
6964                     },
6965         '-' => sub { my $self = shift;
6966                     my $other = shift;
6967                     my $reversed = shift;
6968                     if ($reversed) {
6969                         Carp::my_carp_bug("Bad news.  Can't cope with '"
6970                         . ref($other)
6971                         . ' - '
6972                         . ref($self)
6973                         . "'.  undef returned.");
6974                         return;
6975                     }
6976
6977                     return $self->_range_list - $other;
6978                 },
6979         '~' => sub { my $self = shift;
6980                     return ~ $self->_range_list;
6981                 },
6982     ;
6983
6984     sub _operator_stringify {
6985         my $self = shift;
6986
6987         my $name = $self->complete_name;
6988         return "Table '$name'";
6989     }
6990
6991     sub _range_list {
6992         # Returns the range list associated with this table, which will be the
6993         # complement's if it has one.
6994
6995         my $self = shift;
6996         my $complement;
6997         if (($complement = $self->complement) != 0) {
6998             return ~ $complement->_range_list;
6999         }
7000         else {
7001             return $self->SUPER::_range_list;
7002         }
7003     }
7004
7005     sub add_alias {
7006         # Add a synonym for this table.  See the comments in the base class
7007
7008         my $self = shift;
7009         my $name = shift;
7010         # Rest of parameters passed on.
7011
7012         $self->SUPER::add_alias($name, $self, @_);
7013         return;
7014     }
7015
7016     sub add_conflicting {
7017         # Add the name of some other object to the list of ones that name
7018         # clash with this match table.
7019
7020         my $self = shift;
7021         my $conflicting_name = shift;   # The name of the conflicting object
7022         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
7023         my $conflicting_object = shift; # Optional, the conflicting object
7024                                         # itself.  This is used to
7025                                         # disambiguate the text if the input
7026                                         # name is identical to any of the
7027                                         # aliases $self is known by.
7028                                         # Sometimes the conflicting object is
7029                                         # merely hypothetical, so this has to
7030                                         # be an optional parameter.
7031         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7032
7033         my $addr = do { no overloading; pack 'J', $self; };
7034
7035         # Check if the conflicting name is exactly the same as any existing
7036         # alias in this table (as long as there is a real object there to
7037         # disambiguate with).
7038         if (defined $conflicting_object) {
7039             foreach my $alias ($self->aliases) {
7040                 if ($alias->name eq $conflicting_name) {
7041
7042                     # Here, there is an exact match.  This results in
7043                     # ambiguous comments, so disambiguate by changing the
7044                     # conflicting name to its object's complete equivalent.
7045                     $conflicting_name = $conflicting_object->complete_name;
7046                     last;
7047                 }
7048             }
7049         }
7050
7051         # Convert to the \p{...} final name
7052         $conflicting_name = "\\$p" . "{$conflicting_name}";
7053
7054         # Only add once
7055         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7056
7057         push @{$conflicting{$addr}}, $conflicting_name;
7058
7059         return;
7060     }
7061
7062     sub is_set_equivalent_to {
7063         # Return boolean of whether or not the other object is a table of this
7064         # type and has been marked equivalent to this one.
7065
7066         my $self = shift;
7067         my $other = shift;
7068         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7069
7070         return 0 if ! defined $other; # Can happen for incomplete early
7071                                       # releases
7072         unless ($other->isa(__PACKAGE__)) {
7073             my $ref_other = ref $other;
7074             my $ref_self = ref $self;
7075             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
7076             return 0;
7077         }
7078
7079         # Two tables are equivalent if they have the same leader.
7080         no overloading;
7081         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7082         return;
7083     }
7084
7085     sub set_equivalent_to {
7086         # Set $self equivalent to the parameter table.
7087         # The required Related => 'x' parameter is a boolean indicating
7088         # whether these tables are related or not.  If related, $other becomes
7089         # the 'parent' of $self; if unrelated it becomes the 'leader'
7090         #
7091         # Related tables share all characteristics except names; equivalents
7092         # not quite so many.
7093         # If they are related, one must be a perl extension.  This is because
7094         # we can't guarantee that Unicode won't change one or the other in a
7095         # later release even if they are identical now.
7096
7097         my $self = shift;
7098         my $other = shift;
7099
7100         my %args = @_;
7101         my $related = delete $args{'Related'};
7102
7103         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7104
7105         return if ! defined $other;     # Keep on going; happens in some early
7106                                         # Unicode releases.
7107
7108         if (! defined $related) {
7109             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
7110             $related = 0;
7111         }
7112
7113         # If already are equivalent, no need to re-do it;  if subroutine
7114         # returns null, it found an error, also do nothing
7115         my $are_equivalent = $self->is_set_equivalent_to($other);
7116         return if ! defined $are_equivalent || $are_equivalent;
7117
7118         my $addr = do { no overloading; pack 'J', $self; };
7119         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7120
7121         if ($related) {
7122             if ($current_leader->perl_extension) {
7123                 if ($other->perl_extension) {
7124                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7125                     return;
7126                 }
7127             } elsif ($self->property != $other->property    # Depending on
7128                                                             # situation, might
7129                                                             # be better to use
7130                                                             # add_alias()
7131                                                             # instead for same
7132                                                             # property
7133                      && ! $other->perl_extension)
7134             {
7135                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
7136                 $related = 0;
7137             }
7138         }
7139
7140         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7141             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
7142             return;
7143         }
7144
7145         my $leader = do { no overloading; pack 'J', $current_leader; };
7146         my $other_addr = do { no overloading; pack 'J', $other; };
7147
7148         # Any tables that are equivalent to or children of this table must now
7149         # instead be equivalent to or (children) to the new leader (parent),
7150         # still equivalent.  The equivalency includes their matches_all info,
7151         # and for related tables, their fate and status.
7152         # All related tables are of necessity equivalent, but the converse
7153         # isn't necessarily true
7154         my $status = $other->status;
7155         my $status_info = $other->status_info;
7156         my $fate = $other->fate;
7157         my $matches_all = $matches_all{other_addr};
7158         my $caseless_equivalent = $other->caseless_equivalent;
7159         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7160             next if $table == $other;
7161             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7162
7163             my $table_addr = do { no overloading; pack 'J', $table; };
7164             $leader{$table_addr} = $other;
7165             $matches_all{$table_addr} = $matches_all;
7166             $self->_set_range_list($other->_range_list);
7167             push @{$equivalents{$other_addr}}, $table;
7168             if ($related) {
7169                 $parent{$table_addr} = $other;
7170                 push @{$children{$other_addr}}, $table;
7171                 $table->set_status($status, $status_info);
7172
7173                 # This reason currently doesn't get exposed outside; otherwise
7174                 # would have to look up the parent's reason and use it instead.
7175                 $table->set_fate($fate, "Parent's fate");
7176
7177                 $self->set_caseless_equivalent($caseless_equivalent);
7178             }
7179         }
7180
7181         # Now that we've declared these to be equivalent, any changes to one
7182         # of the tables would invalidate that equivalency.
7183         $self->lock;
7184         $other->lock;
7185         return;
7186     }
7187
7188     sub set_complement {
7189         # Set $self to be the complement of the parameter table.  $self is
7190         # locked, as what it contains should all come from the other table.
7191
7192         my $self = shift;
7193         my $other = shift;
7194
7195         my %args = @_;
7196         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7197
7198         if ($other->complement != 0) {
7199             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7200             return;
7201         }
7202         my $addr = do { no overloading; pack 'J', $self; };
7203         $complement{$addr} = $other;
7204         $self->lock;
7205         return;
7206     }
7207
7208     sub add_range { # Add a range to the list for this table.
7209         my $self = shift;
7210         # Rest of parameters passed on
7211
7212         return if $self->carp_if_locked;
7213         return $self->_range_list->add_range(@_);
7214     }
7215
7216     sub header {
7217         my $self = shift;
7218         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7219
7220         # All match tables are to be used only by the Perl core.
7221         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7222     }
7223
7224     sub pre_body {  # Does nothing for match tables.
7225         return
7226     }
7227
7228     sub append_to_body {  # Does nothing for match tables.
7229         return
7230     }
7231
7232     sub set_fate {
7233         my $self = shift;
7234         my $fate = shift;
7235         my $reason = shift;
7236         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7237
7238         $self->SUPER::set_fate($fate, $reason);
7239
7240         # All children share this fate
7241         foreach my $child ($self->children) {
7242             $child->set_fate($fate, $reason);
7243         }
7244         return;
7245     }
7246
7247     sub write {
7248         my $self = shift;
7249         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7250
7251         return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7252     }
7253
7254     sub set_final_comment {
7255         # This creates a comment for the file that is to hold the match table
7256         # $self.  It is somewhat convoluted to make the English read nicely,
7257         # but, heh, it's just a comment.
7258         # This should be called only with the leader match table of all the
7259         # ones that share the same file.  It lists all such tables, ordered so
7260         # that related ones are together.
7261
7262         return unless $debugging_build;
7263
7264         my $leader = shift;   # Should only be called on the leader table of
7265                               # an equivalent group
7266         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7267
7268         my $addr = do { no overloading; pack 'J', $leader; };
7269
7270         if ($leader{$addr} != $leader) {
7271             Carp::my_carp_bug(<<END
7272 set_final_comment() must be called on a leader table, which $leader is not.
7273 It is equivalent to $leader{$addr}.  No comment created
7274 END
7275             );
7276             return;
7277         }
7278
7279         # Get the number of code points matched by each of the tables in this
7280         # file, and add underscores for clarity.
7281         my $count = $leader->count;
7282         my $string_count = main::clarify_number($count);
7283
7284         my $loose_count = 0;        # how many aliases loosely matched
7285         my $compound_name = "";     # ? Are any names compound?, and if so, an
7286                                     # example
7287         my $properties_with_compound_names = 0;    # count of these
7288
7289
7290         my %flags;              # The status flags used in the file
7291         my $total_entries = 0;  # number of entries written in the comment
7292         my $matches_comment = ""; # The portion of the comment about the
7293                                   # \p{}'s
7294         my @global_comments;    # List of all the tables' comments that are
7295                                 # there before this routine was called.
7296         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
7297                                 # Unicode::UCD.  If not, then don't say it is
7298                                 # in the comment
7299
7300         # Get list of all the parent tables that are equivalent to this one
7301         # (including itself).
7302         my @parents = grep { $parent{main::objaddr $_} == $_ }
7303                             main::uniques($leader, @{$equivalents{$addr}});
7304         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
7305                                               # tables
7306
7307         for my $parent (@parents) {
7308
7309             my $property = $parent->property;
7310
7311             # Special case 'N' tables in properties with two match tables when
7312             # the other is a 'Y' one.  These are likely to be binary tables,
7313             # but not necessarily.  In either case, \P{} will match the
7314             # complement of \p{}, and so if something is a synonym of \p, the
7315             # complement of that something will be the synonym of \P.  This
7316             # would be true of any property with just two match tables, not
7317             # just those whose values are Y and N; but that would require a
7318             # little extra work, and there are none such so far in Unicode.
7319             my $perl_p = 'p';        # which is it?  \p{} or \P{}
7320             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
7321
7322             if (scalar $property->tables == 2
7323                 && $parent == $property->table('N')
7324                 && defined (my $yes = $property->table('Y')))
7325             {
7326                 my $yes_addr = do { no overloading; pack 'J', $yes; };
7327                 @yes_perl_synonyms
7328                     = grep { $_->property == $perl }
7329                                     main::uniques($yes,
7330                                                 $parent{$yes_addr},
7331                                                 $parent{$yes_addr}->children);
7332
7333                 # But these synonyms are \P{} ,not \p{}
7334                 $perl_p = 'P';
7335             }
7336
7337             my @description;        # Will hold the table description
7338             my @note;               # Will hold the table notes.
7339             my @conflicting;        # Will hold the table conflicts.
7340
7341             # Look at the parent, any yes synonyms, and all the children
7342             my $parent_addr = do { no overloading; pack 'J', $parent; };
7343             for my $table ($parent,
7344                            @yes_perl_synonyms,
7345                            @{$children{$parent_addr}})
7346             {
7347                 my $table_addr = do { no overloading; pack 'J', $table; };
7348                 my $table_property = $table->property;
7349
7350                 # Tables are separated by a blank line to create a grouping.
7351                 $matches_comment .= "\n" if $matches_comment;
7352
7353                 # The table is named based on the property and value
7354                 # combination it is for, like script=greek.  But there may be
7355                 # a number of synonyms for each side, like 'sc' for 'script',
7356                 # and 'grek' for 'greek'.  Any combination of these is a valid
7357                 # name for this table.  In this case, there are three more,
7358                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
7359                 # listing all possible combinations in the comment, we make
7360                 # sure that each synonym occurs at least once, and add
7361                 # commentary that the other combinations are possible.
7362                 # Because regular expressions don't recognize things like
7363                 # \p{jsn=}, only look at non-null right-hand-sides
7364                 my @property_aliases = $table_property->aliases;
7365                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7366
7367                 # The alias lists above are already ordered in the order we
7368                 # want to output them.  To ensure that each synonym is listed,
7369                 # we must use the max of the two numbers.  But if there are no
7370                 # legal synonyms (nothing in @table_aliases), then we don't
7371                 # list anything.
7372                 my $listed_combos = (@table_aliases)
7373                                     ?  main::max(scalar @table_aliases,
7374                                                  scalar @property_aliases)
7375                                     : 0;
7376                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7377
7378
7379                 my $property_had_compound_name = 0;
7380
7381                 for my $i (0 .. $listed_combos - 1) {
7382                     $total_entries++;
7383
7384                     # The current alias for the property is the next one on
7385                     # the list, or if beyond the end, start over.  Similarly
7386                     # for the table (\p{prop=table})
7387                     my $property_alias = $property_aliases
7388                                             [$i % @property_aliases]->name;
7389                     my $table_alias_object = $table_aliases
7390                                                         [$i % @table_aliases];
7391                     my $table_alias = $table_alias_object->name;
7392                     my $loose_match = $table_alias_object->loose_match;
7393                     $has_ucd_alias |= $table_alias_object->ucd;
7394
7395                     if ($table_alias !~ /\D/) { # Clarify large numbers.
7396                         $table_alias = main::clarify_number($table_alias)
7397                     }
7398
7399                     # Add a comment for this alias combination
7400                     my $current_match_comment;
7401                     if ($table_property == $perl) {
7402                         $current_match_comment = "\\$perl_p"
7403                                                     . "{$table_alias}";
7404                     }
7405                     else {
7406                         $current_match_comment
7407                                         = "\\p{$property_alias=$table_alias}";
7408                         $property_had_compound_name = 1;
7409                     }
7410
7411                     # Flag any abnormal status for this table.
7412                     my $flag = $property->status
7413                                 || $table->status
7414                                 || $table_alias_object->status;
7415                     if ($flag && $flag ne $PLACEHOLDER) {
7416                         $flags{$flag} = $status_past_participles{$flag};
7417                     }
7418
7419                     $loose_count++;
7420
7421                     # Pretty up the comment.  Note the \b; it says don't make
7422                     # this line a continuation.
7423                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7424                                         $flag,
7425                                         " " x 7,
7426                                         $current_match_comment);
7427                 } # End of generating the entries for this table.
7428
7429                 # Save these for output after this group of related tables.
7430                 push @description, $table->description;
7431                 push @note, $table->note;
7432                 push @conflicting, $table->conflicting;
7433
7434                 # And this for output after all the tables.
7435                 push @global_comments, $table->comment;
7436
7437                 # Compute an alternate compound name using the final property
7438                 # synonym and the first table synonym with a colon instead of
7439                 # the equal sign used elsewhere.
7440                 if ($property_had_compound_name) {
7441                     $properties_with_compound_names ++;
7442                     if (! $compound_name || @property_aliases > 1) {
7443                         $compound_name = $property_aliases[-1]->name
7444                                         . ': '
7445                                         . $table_aliases[0]->name;
7446                     }
7447                 }
7448             } # End of looping through all children of this table
7449
7450             # Here have assembled in $matches_comment all the related tables
7451             # to the current parent (preceded by the same info for all the
7452             # previous parents).  Put out information that applies to all of
7453             # the current family.
7454             if (@conflicting) {
7455
7456                 # But output the conflicting information now, as it applies to
7457                 # just this table.
7458                 my $conflicting = join ", ", @conflicting;
7459                 if ($conflicting) {
7460                     $matches_comment .= <<END;
7461
7462     Note that contrary to what you might expect, the above is NOT the same as
7463 END
7464                     $matches_comment .= "any of: " if @conflicting > 1;
7465                     $matches_comment .= "$conflicting\n";
7466                 }
7467             }
7468             if (@description) {
7469                 $matches_comment .= "\n    Meaning: "
7470                                     . join('; ', @description)
7471                                     . "\n";
7472             }
7473             if (@note) {
7474                 $matches_comment .= "\n    Note: "
7475                                     . join("\n    ", @note)
7476                                     . "\n";
7477             }
7478         } # End of looping through all tables
7479
7480
7481         my $code_points;
7482         my $match;
7483         my $any_of_these;
7484         if ($count == 1) {
7485             $match = 'matches';
7486             $code_points = 'single code point';
7487         }
7488         else {
7489             $match = 'match';
7490             $code_points = "$string_count code points";
7491         }
7492
7493         my $synonyms;
7494         my $entries;
7495         if ($total_entries == 1) {
7496             $synonyms = "";
7497             $entries = 'entry';
7498             $any_of_these = 'this'
7499         }
7500         else {
7501             $synonyms = " any of the following regular expression constructs";
7502             $entries = 'entries';
7503             $any_of_these = 'any of these'
7504         }
7505
7506         my $comment = "";
7507         if ($has_ucd_alias) {
7508             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7509         }
7510         if ($has_unrelated) {
7511             $comment .= <<END;
7512 This file is for tables that are not necessarily related:  To conserve
7513 resources, every table that matches the identical set of code points in this
7514 version of Unicode uses this file.  Each one is listed in a separate group
7515 below.  It could be that the tables will match the same set of code points in
7516 other Unicode releases, or it could be purely coincidence that they happen to
7517 be the same in Unicode $string_version, and hence may not in other versions.
7518
7519 END
7520         }
7521
7522         if (%flags) {
7523             foreach my $flag (sort keys %flags) {
7524                 $comment .= <<END;
7525 '$flag' below means that this form is $flags{$flag}.
7526 Consult $pod_file.pod
7527 END
7528             }
7529             $comment .= "\n";
7530         }
7531
7532         if ($total_entries == 0) {
7533             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
7534             $comment .= <<END;
7535 This file returns the $code_points in Unicode Version $string_version for
7536 $leader, but it is inaccessible through Perl regular expressions, as
7537 "\\p{prop=}" is not recognized.
7538 END
7539
7540         } else {
7541             $comment .= <<END;
7542 This file returns the $code_points in Unicode Version $string_version that
7543 $match$synonyms:
7544
7545 $matches_comment
7546 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7547 including if adding or subtracting white space, underscore, and hyphen
7548 characters matters or doesn't matter, and other permissible syntactic
7549 variants.  Upper/lower case distinctions never matter.
7550 END
7551
7552         }
7553         if ($compound_name) {
7554             $comment .= <<END;
7555
7556 A colon can be substituted for the equals sign, and
7557 END
7558             if ($properties_with_compound_names > 1) {
7559                 $comment .= <<END;
7560 within each group above,
7561 END
7562             }
7563             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7564
7565             # Note the \b below, it says don't make that line a continuation.
7566             $comment .= <<END;
7567 anything to the left of the equals (or colon) can be combined with anything to
7568 the right.  Thus, for example,
7569 $compound_name
7570 \bis also valid.
7571 END
7572         }
7573
7574         # And append any comment(s) from the actual tables.  They are all
7575         # gathered here, so may not read all that well.
7576         if (@global_comments) {
7577             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7578         }
7579
7580         if ($count) {   # The format differs if no code points, and needs no
7581                         # explanation in that case
7582                 $comment.= <<END;
7583
7584 The format of the lines of this file is:
7585 END
7586             $comment.= <<END;
7587 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7588 STOP is the ending point, or if omitted, the range has just one code point.
7589 END
7590             if ($leader->output_range_counts) {
7591                 $comment .= <<END;
7592 Numbers in comments in [brackets] indicate how many code points are in the
7593 range.
7594 END
7595             }
7596         }
7597
7598         $leader->set_comment(main::join_lines($comment));
7599         return;
7600     }
7601
7602     # Accessors for the underlying list
7603     for my $sub (qw(
7604                     get_valid_code_point
7605                     get_invalid_code_point
7606                 ))
7607     {
7608         no strict "refs";
7609         *$sub = sub {
7610             use strict "refs";
7611             my $self = shift;
7612
7613             return $self->_range_list->$sub(@_);
7614         }
7615     }
7616 } # End closure for Match_Table
7617
7618 package Property;
7619
7620 # The Property class represents a Unicode property, or the $perl
7621 # pseudo-property.  It contains a map table initialized empty at construction
7622 # time, and for properties accessible through regular expressions, various
7623 # match tables, created through the add_match_table() method, and referenced
7624 # by the table('NAME') or tables() methods, the latter returning a list of all
7625 # of the match tables.  Otherwise table operations implicitly are for the map
7626 # table.
7627 #
7628 # Most of the data in the property is actually about its map table, so it
7629 # mostly just uses that table's accessors for most methods.  The two could
7630 # have been combined into one object, but for clarity because of their
7631 # differing semantics, they have been kept separate.  It could be argued that
7632 # the 'file' and 'directory' fields should be kept with the map table.
7633 #
7634 # Each property has a type.  This can be set in the constructor, or in the
7635 # set_type accessor, but mostly it is figured out by the data.  Every property
7636 # starts with unknown type, overridden by a parameter to the constructor, or
7637 # as match tables are added, or ranges added to the map table, the data is
7638 # inspected, and the type changed.  After the table is mostly or entirely
7639 # filled, compute_type() should be called to finalize they analysis.
7640 #
7641 # There are very few operations defined.  One can safely remove a range from
7642 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7643 # table to this one, replacing any in the intersection of the two.
7644
7645 sub standardize { return main::standardize($_[0]); }
7646 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7647
7648 {   # Closure
7649
7650     # This hash will contain as keys, all the aliases of all properties, and
7651     # as values, pointers to their respective property objects.  This allows
7652     # quick look-up of a property from any of its names.
7653     my %alias_to_property_of;
7654
7655     sub dump_alias_to_property_of {
7656         # For debugging
7657
7658         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7659         return;
7660     }
7661
7662     sub property_ref {
7663         # This is a package subroutine, not called as a method.
7664         # If the single parameter is a literal '*' it returns a list of all
7665         # defined properties.
7666         # Otherwise, the single parameter is a name, and it returns a pointer
7667         # to the corresponding property object, or undef if none.
7668         #
7669         # Properties can have several different names.  The 'standard' form of
7670         # each of them is stored in %alias_to_property_of as they are defined.
7671         # But it's possible that this subroutine will be called with some
7672         # variant, so if the initial lookup fails, it is repeated with the
7673         # standardized form of the input name.  If found, besides returning the
7674         # result, the input name is added to the list so future calls won't
7675         # have to do the conversion again.
7676
7677         my $name = shift;
7678
7679         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7680
7681         if (! defined $name) {
7682             Carp::my_carp_bug("Undefined input property.  No action taken.");
7683             return;
7684         }
7685
7686         return main::uniques(values %alias_to_property_of) if $name eq '*';
7687
7688         # Return cached result if have it.
7689         my $result = $alias_to_property_of{$name};
7690         return $result if defined $result;
7691
7692         # Convert the input to standard form.
7693         my $standard_name = standardize($name);
7694
7695         $result = $alias_to_property_of{$standard_name};
7696         return unless defined $result;        # Don't cache undefs
7697
7698         # Cache the result before returning it.
7699         $alias_to_property_of{$name} = $result;
7700         return $result;
7701     }
7702
7703
7704     main::setup_package();
7705
7706     my %map;
7707     # A pointer to the map table object for this property
7708     main::set_access('map', \%map);
7709
7710     my %full_name;
7711     # The property's full name.  This is a duplicate of the copy kept in the
7712     # map table, but is needed because stringify needs it during
7713     # construction of the map table, and then would have a chicken before egg
7714     # problem.
7715     main::set_access('full_name', \%full_name, 'r');
7716
7717     my %table_ref;
7718     # This hash will contain as keys, all the aliases of any match tables
7719     # attached to this property, and as values, the pointers to their
7720     # respective tables.  This allows quick look-up of a table from any of its
7721     # names.
7722     main::set_access('table_ref', \%table_ref);
7723
7724     my %type;
7725     # The type of the property, $ENUM, $BINARY, etc
7726     main::set_access('type', \%type, 'r');
7727
7728     my %file;
7729     # The filename where the map table will go (if actually written).
7730     # Normally defaulted, but can be overridden.
7731     main::set_access('file', \%file, 'r', 's');
7732
7733     my %directory;
7734     # The directory where the map table will go (if actually written).
7735     # Normally defaulted, but can be overridden.
7736     main::set_access('directory', \%directory, 's');
7737
7738     my %pseudo_map_type;
7739     # This is used to affect the calculation of the map types for all the
7740     # ranges in the table.  It should be set to one of the values that signify
7741     # to alter the calculation.
7742     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7743
7744     my %has_only_code_point_maps;
7745     # A boolean used to help in computing the type of data in the map table.
7746     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7747
7748     my %unique_maps;
7749     # A list of the first few distinct mappings this property has.  This is
7750     # used to disambiguate between binary and enum property types, so don't
7751     # have to keep more than three.
7752     main::set_access('unique_maps', \%unique_maps);
7753
7754     my %pre_declared_maps;
7755     # A boolean that gives whether the input data should declare all the
7756     # tables used, or not.  If the former, unknown ones raise a warning.
7757     main::set_access('pre_declared_maps',
7758                                     \%pre_declared_maps, 'r', 's');
7759
7760     sub new {
7761         # The only required parameter is the positionally first, name.  All
7762         # other parameters are key => value pairs.  See the documentation just
7763         # above for the meanings of the ones not passed directly on to the map
7764         # table constructor.
7765
7766         my $class = shift;
7767         my $name = shift || "";
7768
7769         my $self = property_ref($name);
7770         if (defined $self) {
7771             my $options_string = join ", ", @_;
7772             $options_string = ".  Ignoring options $options_string" if $options_string;
7773             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
7774             return $self;
7775         }
7776
7777         my %args = @_;
7778
7779         $self = bless \do { my $anonymous_scalar }, $class;
7780         my $addr = do { no overloading; pack 'J', $self; };
7781
7782         $directory{$addr} = delete $args{'Directory'};
7783         $file{$addr} = delete $args{'File'};
7784         $full_name{$addr} = delete $args{'Full_Name'} || $name;
7785         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7786         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7787         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7788                                     # Starting in this release, property
7789                                     # values should be defined for all
7790                                     # properties, except those overriding this
7791                                     // $v_version ge v5.1.0;
7792
7793         # Rest of parameters passed on.
7794
7795         $has_only_code_point_maps{$addr} = 1;
7796         $table_ref{$addr} = { };
7797         $unique_maps{$addr} = { };
7798
7799         $map{$addr} = Map_Table->new($name,
7800                                     Full_Name => $full_name{$addr},
7801                                     _Alias_Hash => \%alias_to_property_of,
7802                                     _Property => $self,
7803                                     %args);
7804         return $self;
7805     }
7806
7807     # See this program's beginning comment block about overloading the copy
7808     # constructor.  Few operations are defined on properties, but a couple are
7809     # useful.  It is safe to take the inverse of a property, and to remove a
7810     # single code point from it.
7811     use overload
7812         fallback => 0,
7813         qw("") => "_operator_stringify",
7814         "." => \&main::_operator_dot,
7815         ".=" => \&main::_operator_dot_equal,
7816         '==' => \&main::_operator_equal,
7817         '!=' => \&main::_operator_not_equal,
7818         '=' => sub { return shift },
7819         '-=' => "_minus_and_equal",
7820     ;
7821
7822     sub _operator_stringify {
7823         return "Property '" .  shift->full_name . "'";
7824     }
7825
7826     sub _minus_and_equal {
7827         # Remove a single code point from the map table of a property.
7828
7829         my $self = shift;
7830         my $other = shift;
7831         my $reversed = shift;
7832         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7833
7834         if (ref $other) {
7835             Carp::my_carp_bug("Bad news.  Can't cope with a "
7836                         . ref($other)
7837                         . " argument to '-='.  Subtraction ignored.");
7838             return $self;
7839         }
7840         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
7841             Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
7842             . ref $self
7843             . " from a non-object.  undef returned.");
7844             return;
7845         }
7846         else {
7847             no overloading;
7848             $map{pack 'J', $self}->delete_range($other, $other);
7849         }
7850         return $self;
7851     }
7852
7853     sub add_match_table {
7854         # Add a new match table for this property, with name given by the
7855         # parameter.  It returns a pointer to the table.
7856
7857         my $self = shift;
7858         my $name = shift;
7859         my %args = @_;
7860
7861         my $addr = do { no overloading; pack 'J', $self; };
7862
7863         my $table = $table_ref{$addr}{$name};
7864         my $standard_name = main::standardize($name);
7865         if (defined $table
7866             || (defined ($table = $table_ref{$addr}{$standard_name})))
7867         {
7868             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
7869             $table_ref{$addr}{$name} = $table;
7870             return $table;
7871         }
7872         else {
7873
7874             # See if this is a perl extension, if not passed in.
7875             my $perl_extension = delete $args{'Perl_Extension'};
7876             $perl_extension
7877                         = $self->perl_extension if ! defined $perl_extension;
7878
7879             $table = Match_Table->new(
7880                                 Name => $name,
7881                                 Perl_Extension => $perl_extension,
7882                                 _Alias_Hash => $table_ref{$addr},
7883                                 _Property => $self,
7884
7885                                 # gets property's fate and status by default
7886                                 Fate => $self->fate,
7887                                 Status => $self->status,
7888                                 _Status_Info => $self->status_info,
7889                                 %args);
7890             return unless defined $table;
7891         }
7892
7893         # Save the names for quick look up
7894         $table_ref{$addr}{$standard_name} = $table;
7895         $table_ref{$addr}{$name} = $table;
7896
7897         # Perhaps we can figure out the type of this property based on the
7898         # fact of adding this match table.  First, string properties don't
7899         # have match tables; second, a binary property can't have 3 match
7900         # tables
7901         if ($type{$addr} == $UNKNOWN) {
7902             $type{$addr} = $NON_STRING;
7903         }
7904         elsif ($type{$addr} == $STRING) {
7905             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
7906             $type{$addr} = $NON_STRING;
7907         }
7908         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7909             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7910                 && $type{$addr} == $BINARY)
7911             {
7912                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
7913                 $type{$addr} = $ENUM;
7914             }
7915         }
7916
7917         return $table;
7918     }
7919
7920     sub delete_match_table {
7921         # Delete the table referred to by $2 from the property $1.
7922
7923         my $self = shift;
7924         my $table_to_remove = shift;
7925         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7926
7927         my $addr = do { no overloading; pack 'J', $self; };
7928
7929         # Remove all names that refer to it.
7930         foreach my $key (keys %{$table_ref{$addr}}) {
7931             delete $table_ref{$addr}{$key}
7932                                 if $table_ref{$addr}{$key} == $table_to_remove;
7933         }
7934
7935         $table_to_remove->DESTROY;
7936         return;
7937     }
7938
7939     sub table {
7940         # Return a pointer to the match table (with name given by the
7941         # parameter) associated with this property; undef if none.
7942
7943         my $self = shift;
7944         my $name = shift;
7945         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7946
7947         my $addr = do { no overloading; pack 'J', $self; };
7948
7949         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7950
7951         # If quick look-up failed, try again using the standard form of the
7952         # input name.  If that succeeds, cache the result before returning so
7953         # won't have to standardize this input name again.
7954         my $standard_name = main::standardize($name);
7955         return unless defined $table_ref{$addr}{$standard_name};
7956
7957         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7958         return $table_ref{$addr}{$name};
7959     }
7960
7961     sub tables {
7962         # Return a list of pointers to all the match tables attached to this
7963         # property
7964
7965         no overloading;
7966         return main::uniques(values %{$table_ref{pack 'J', shift}});
7967     }
7968
7969     sub directory {
7970         # Returns the directory the map table for this property should be
7971         # output in.  If a specific directory has been specified, that has
7972         # priority;  'undef' is returned if the type isn't defined;
7973         # or $map_directory for everything else.
7974
7975         my $addr = do { no overloading; pack 'J', shift; };
7976
7977         return $directory{$addr} if defined $directory{$addr};
7978         return undef if $type{$addr} == $UNKNOWN;
7979         return $map_directory;
7980     }
7981
7982     sub swash_name {
7983         # Return the name that is used to both:
7984         #   1)  Name the file that the map table is written to.
7985         #   2)  The name of swash related stuff inside that file.
7986         # The reason for this is that the Perl core historically has used
7987         # certain names that aren't the same as the Unicode property names.
7988         # To continue using these, $file is hard-coded in this file for those,
7989         # but otherwise the standard name is used.  This is different from the
7990         # external_name, so that the rest of the files, like in lib can use
7991         # the standard name always, without regard to historical precedent.
7992
7993         my $self = shift;
7994         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7995
7996         my $addr = do { no overloading; pack 'J', $self; };
7997
7998         # Swash names are used only on regular map tables; otherwise there
7999         # should be no access to the property map table from other parts of
8000         # Perl.
8001         return if $map{$addr}->fate != $ORDINARY;
8002
8003         return $file{$addr} if defined $file{$addr};
8004         return $map{$addr}->external_name;
8005     }
8006
8007     sub to_create_match_tables {
8008         # Returns a boolean as to whether or not match tables should be
8009         # created for this property.
8010
8011         my $self = shift;
8012         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8013
8014         # The whole point of this pseudo property is match tables.
8015         return 1 if $self == $perl;
8016
8017         my $addr = do { no overloading; pack 'J', $self; };
8018
8019         # Don't generate tables of code points that match the property values
8020         # of a string property.  Such a list would most likely have many
8021         # property values, each with just one or very few code points mapping
8022         # to it.
8023         return 0 if $type{$addr} == $STRING;
8024
8025         # Don't generate anything for unimplemented properties.
8026         return 0 if grep { $self->complete_name eq $_ }
8027                                                     @unimplemented_properties;
8028         # Otherwise, do.
8029         return 1;
8030     }
8031
8032     sub property_add_or_replace_non_nulls {
8033         # This adds the mappings in the property $other to $self.  Non-null
8034         # mappings from $other override those in $self.  It essentially merges
8035         # the two properties, with the second having priority except for null
8036         # mappings.
8037
8038         my $self = shift;
8039         my $other = shift;
8040         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8041
8042         if (! $other->isa(__PACKAGE__)) {
8043             Carp::my_carp_bug("$other should be a "
8044                             . __PACKAGE__
8045                             . ".  Not a '"
8046                             . ref($other)
8047                             . "'.  Not added;");
8048             return;
8049         }
8050
8051         no overloading;
8052         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8053     }
8054
8055     sub set_proxy_for {
8056         # Certain tables are not generally written out to files, but
8057         # Unicode::UCD has the intelligence to know that the file for $self
8058         # can be used to reconstruct those tables.  This routine just changes
8059         # things so that UCD pod entries for those suppressed tables are
8060         # generated, so the fact that a proxy is used is invisible to the
8061         # user.
8062
8063         my $self = shift;
8064
8065         foreach my $property_name (@_) {
8066             my $ref = property_ref($property_name);
8067             next if $ref->to_output_map;
8068             $ref->set_fate($MAP_PROXIED);
8069         }
8070     }
8071
8072     sub set_type {
8073         # Set the type of the property.  Mostly this is figured out by the
8074         # data in the table.  But this is used to set it explicitly.  The
8075         # reason it is not a standard accessor is that when setting a binary
8076         # property, we need to make sure that all the true/false aliases are
8077         # present, as they were omitted in early Unicode releases.
8078
8079         my $self = shift;
8080         my $type = shift;
8081         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8082
8083         if ($type != $ENUM
8084             && $type != $BINARY
8085             && $type != $FORCED_BINARY
8086             && $type != $STRING)
8087         {
8088             Carp::my_carp("Unrecognized type '$type'.  Type not set");
8089             return;
8090         }
8091
8092         { no overloading; $type{pack 'J', $self} = $type; }
8093         return if $type != $BINARY && $type != $FORCED_BINARY;
8094
8095         my $yes = $self->table('Y');
8096         $yes = $self->table('Yes') if ! defined $yes;
8097         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8098                                                             if ! defined $yes;
8099
8100         # Add aliases in order wanted, duplicates will be ignored.  We use a
8101         # binary property present in all releases for its ordered lists of
8102         # true/false aliases.  Note, that could run into problems in
8103         # outputting things in that we don't distinguish between the name and
8104         # full name of these.  Hopefully, if the table was already created
8105         # before this code is executed, it was done with these set properly.
8106         my $bm = property_ref("Bidi_Mirrored");
8107         foreach my $alias ($bm->table("Y")->aliases) {
8108             $yes->add_alias($alias->name);
8109         }
8110         my $no = $self->table('N');
8111         $no = $self->table('No') if ! defined $no;
8112         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8113         foreach my $alias ($bm->table("N")->aliases) {
8114             $no->add_alias($alias->name);
8115         }
8116
8117         return;
8118     }
8119
8120     sub add_map {
8121         # Add a map to the property's map table.  This also keeps
8122         # track of the maps so that the property type can be determined from
8123         # its data.
8124
8125         my $self = shift;
8126         my $start = shift;  # First code point in range
8127         my $end = shift;    # Final code point in range
8128         my $map = shift;    # What the range maps to.
8129         # Rest of parameters passed on.
8130
8131         my $addr = do { no overloading; pack 'J', $self; };
8132
8133         # If haven't the type of the property, gather information to figure it
8134         # out.
8135         if ($type{$addr} == $UNKNOWN) {
8136
8137             # If the map contains an interior blank or dash, or most other
8138             # nonword characters, it will be a string property.  This
8139             # heuristic may actually miss some string properties.  If so, they
8140             # may need to have explicit set_types called for them.  This
8141             # happens in the Unihan properties.
8142             if ($map =~ / (?<= . ) [ -] (?= . ) /x
8143                 || $map =~ / [^\w.\/\ -]  /x)
8144             {
8145                 $self->set_type($STRING);
8146
8147                 # $unique_maps is used for disambiguating between ENUM and
8148                 # BINARY later; since we know the property is not going to be
8149                 # one of those, no point in keeping the data around
8150                 undef $unique_maps{$addr};
8151             }
8152             else {
8153
8154                 # Not necessarily a string.  The final decision has to be
8155                 # deferred until all the data are in.  We keep track of if all
8156                 # the values are code points for that eventual decision.
8157                 $has_only_code_point_maps{$addr} &=
8158                                             $map =~ / ^ $code_point_re $/x;
8159
8160                 # For the purposes of disambiguating between binary and other
8161                 # enumerations at the end, we keep track of the first three
8162                 # distinct property values.  Once we get to three, we know
8163                 # it's not going to be binary, so no need to track more.
8164                 if (scalar keys %{$unique_maps{$addr}} < 3) {
8165                     $unique_maps{$addr}{main::standardize($map)} = 1;
8166                 }
8167             }
8168         }
8169
8170         # Add the mapping by calling our map table's method
8171         return $map{$addr}->add_map($start, $end, $map, @_);
8172     }
8173
8174     sub compute_type {
8175         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
8176         # should be called after the property is mostly filled with its maps.
8177         # We have been keeping track of what the property values have been,
8178         # and now have the necessary information to figure out the type.
8179
8180         my $self = shift;
8181         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8182
8183         my $addr = do { no overloading; pack 'J', $self; };
8184
8185         my $type = $type{$addr};
8186
8187         # If already have figured these out, no need to do so again, but we do
8188         # a double check on ENUMS to make sure that a string property hasn't
8189         # improperly been classified as an ENUM, so continue on with those.
8190         return if $type == $STRING
8191                   || $type == $BINARY
8192                   || $type == $FORCED_BINARY;
8193
8194         # If every map is to a code point, is a string property.
8195         if ($type == $UNKNOWN
8196             && ($has_only_code_point_maps{$addr}
8197                 || (defined $map{$addr}->default_map
8198                     && $map{$addr}->default_map eq "")))
8199         {
8200             $self->set_type($STRING);
8201         }
8202         else {
8203
8204             # Otherwise, it is to some sort of enumeration.  (The case where
8205             # it is a Unicode miscellaneous property, and treated like a
8206             # string in this program is handled in add_map()).  Distinguish
8207             # between binary and some other enumeration type.  Of course, if
8208             # there are more than two values, it's not binary.  But more
8209             # subtle is the test that the default mapping is defined means it
8210             # isn't binary.  This in fact may change in the future if Unicode
8211             # changes the way its data is structured.  But so far, no binary
8212             # properties ever have @missing lines for them, so the default map
8213             # isn't defined for them.  The few properties that are two-valued
8214             # and aren't considered binary have the default map defined
8215             # starting in Unicode 5.0, when the @missing lines appeared; and
8216             # this program has special code to put in a default map for them
8217             # for earlier than 5.0 releases.
8218             if ($type == $ENUM
8219                 || scalar keys %{$unique_maps{$addr}} > 2
8220                 || defined $self->default_map)
8221             {
8222                 my $tables = $self->tables;
8223                 my $count = $self->count;
8224                 if ($verbosity && $count > 500 && $tables/$count > .1) {
8225                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
8226                 }
8227                 $self->set_type($ENUM);
8228             }
8229             else {
8230                 $self->set_type($BINARY);
8231             }
8232         }
8233         undef $unique_maps{$addr};  # Garbage collect
8234         return;
8235     }
8236
8237     sub set_fate {
8238         my $self = shift;
8239         my $fate = shift;
8240         my $reason = shift;  # Ignored unless suppressing
8241         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8242
8243         my $addr = do { no overloading; pack 'J', $self; };
8244         if ($fate == $SUPPRESSED) {
8245             $why_suppressed{$self->complete_name} = $reason;
8246         }
8247
8248         # Each table shares the property's fate, except that MAP_PROXIED
8249         # doesn't affect match tables
8250         $map{$addr}->set_fate($fate, $reason);
8251         if ($fate != $MAP_PROXIED) {
8252             foreach my $table ($map{$addr}, $self->tables) {
8253                 $table->set_fate($fate, $reason);
8254             }
8255         }
8256         return;
8257     }
8258
8259
8260     # Most of the accessors for a property actually apply to its map table.
8261     # Setup up accessor functions for those, referring to %map
8262     for my $sub (qw(
8263                     add_alias
8264                     add_anomalous_entry
8265                     add_comment
8266                     add_conflicting
8267                     add_description
8268                     add_duplicate
8269                     add_note
8270                     aliases
8271                     comment
8272                     complete_name
8273                     containing_range
8274                     count
8275                     default_map
8276                     delete_range
8277                     description
8278                     each_range
8279                     external_name
8280                     fate
8281                     file_path
8282                     format
8283                     initialize
8284                     inverse_list
8285                     is_empty
8286                     name
8287                     note
8288                     perl_extension
8289                     property
8290                     range_count
8291                     ranges
8292                     range_size_1
8293                     reset_each_range
8294                     set_comment
8295                     set_default_map
8296                     set_file_path
8297                     set_final_comment
8298                     _set_format
8299                     set_range_size_1
8300                     set_status
8301                     set_to_output_map
8302                     short_name
8303                     status
8304                     status_info
8305                     to_output_map
8306                     type_of
8307                     value_of
8308                     write
8309                 ))
8310                     # 'property' above is for symmetry, so that one can take
8311                     # the property of a property and get itself, and so don't
8312                     # have to distinguish between properties and tables in
8313                     # calling code
8314     {
8315         no strict "refs";
8316         *$sub = sub {
8317             use strict "refs";
8318             my $self = shift;
8319             no overloading;
8320             return $map{pack 'J', $self}->$sub(@_);
8321         }
8322     }
8323
8324
8325 } # End closure
8326
8327 package main;
8328
8329 sub join_lines($) {
8330     # Returns lines of the input joined together, so that they can be folded
8331     # properly.
8332     # This causes continuation lines to be joined together into one long line
8333     # for folding.  A continuation line is any line that doesn't begin with a
8334     # space or "\b" (the latter is stripped from the output).  This is so
8335     # lines can be be in a HERE document so as to fit nicely in the terminal
8336     # width, but be joined together in one long line, and then folded with
8337     # indents, '#' prefixes, etc, properly handled.
8338     # A blank separates the joined lines except if there is a break; an extra
8339     # blank is inserted after a period ending a line.
8340
8341     # Initialize the return with the first line.
8342     my ($return, @lines) = split "\n", shift;
8343
8344     # If the first line is null, it was an empty line, add the \n back in
8345     $return = "\n" if $return eq "";
8346
8347     # Now join the remainder of the physical lines.
8348     for my $line (@lines) {
8349
8350         # An empty line means wanted a blank line, so add two \n's to get that
8351         # effect, and go to the next line.
8352         if (length $line == 0) {
8353             $return .= "\n\n";
8354             next;
8355         }
8356
8357         # Look at the last character of what we have so far.
8358         my $previous_char = substr($return, -1, 1);
8359
8360         # And at the next char to be output.
8361         my $next_char = substr($line, 0, 1);
8362
8363         if ($previous_char ne "\n") {
8364
8365             # Here didn't end wth a nl.  If the next char a blank or \b, it
8366             # means that here there is a break anyway.  So add a nl to the
8367             # output.
8368             if ($next_char eq " " || $next_char eq "\b") {
8369                 $previous_char = "\n";
8370                 $return .= $previous_char;
8371             }
8372
8373             # Add an extra space after periods.
8374             $return .= " " if $previous_char eq '.';
8375         }
8376
8377         # Here $previous_char is still the latest character to be output.  If
8378         # it isn't a nl, it means that the next line is to be a continuation
8379         # line, with a blank inserted between them.
8380         $return .= " " if $previous_char ne "\n";
8381
8382         # Get rid of any \b
8383         substr($line, 0, 1) = "" if $next_char eq "\b";
8384
8385         # And append this next line.
8386         $return .= $line;
8387     }
8388
8389     return $return;
8390 }
8391
8392 sub simple_fold($;$$$) {
8393     # Returns a string of the input (string or an array of strings) folded
8394     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8395     # a \n
8396     # This is tailored for the kind of text written by this program,
8397     # especially the pod file, which can have very long names with
8398     # underscores in the middle, or words like AbcDefgHij....  We allow
8399     # breaking in the middle of such constructs if the line won't fit
8400     # otherwise.  The break in such cases will come either just after an
8401     # underscore, or just before one of the Capital letters.
8402
8403     local $to_trace = 0 if main::DEBUG;
8404
8405     my $line = shift;
8406     my $prefix = shift;     # Optional string to prepend to each output
8407                             # line
8408     $prefix = "" unless defined $prefix;
8409
8410     my $hanging_indent = shift; # Optional number of spaces to indent
8411                                 # continuation lines
8412     $hanging_indent = 0 unless $hanging_indent;
8413
8414     my $right_margin = shift;   # Optional number of spaces to narrow the
8415                                 # total width by.
8416     $right_margin = 0 unless defined $right_margin;
8417
8418     # Call carp with the 'nofold' option to avoid it from trying to call us
8419     # recursively
8420     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8421
8422     # The space available doesn't include what's automatically prepended
8423     # to each line, or what's reserved on the right.
8424     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8425     # XXX Instead of using the 'nofold' perhaps better to look up the stack
8426
8427     if (DEBUG && $hanging_indent >= $max) {
8428         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
8429         $hanging_indent = 0;
8430     }
8431
8432     # First, split into the current physical lines.
8433     my @line;
8434     if (ref $line) {        # Better be an array, because not bothering to
8435                             # test
8436         foreach my $line (@{$line}) {
8437             push @line, split /\n/, $line;
8438         }
8439     }
8440     else {
8441         @line = split /\n/, $line;
8442     }
8443
8444     #local $to_trace = 1 if main::DEBUG;
8445     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8446
8447     # Look at each current physical line.
8448     for (my $i = 0; $i < @line; $i++) {
8449         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8450         #local $to_trace = 1 if main::DEBUG;
8451         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8452
8453         # Remove prefix, because will be added back anyway, don't want
8454         # doubled prefix
8455         $line[$i] =~ s/^$prefix//;
8456
8457         # Remove trailing space
8458         $line[$i] =~ s/\s+\Z//;
8459
8460         # If the line is too long, fold it.
8461         if (length $line[$i] > $max) {
8462             my $remainder;
8463
8464             # Here needs to fold.  Save the leading space in the line for
8465             # later.
8466             $line[$i] =~ /^ ( \s* )/x;
8467             my $leading_space = $1;
8468             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8469
8470             # If character at final permissible position is white space,
8471             # fold there, which will delete that white space
8472             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8473                 $remainder = substr($line[$i], $max);
8474                 $line[$i] = substr($line[$i], 0, $max - 1);
8475             }
8476             else {
8477
8478                 # Otherwise fold at an acceptable break char closest to
8479                 # the max length.  Look at just the maximal initial
8480                 # segment of the line
8481                 my $segment = substr($line[$i], 0, $max - 1);
8482                 if ($segment =~
8483                     /^ ( .{$hanging_indent}   # Don't look before the
8484                                               #  indent.
8485                         \ *                   # Don't look in leading
8486                                               #  blanks past the indent
8487                             [^ ] .*           # Find the right-most
8488                         (?:                   #  acceptable break:
8489                             [ \s = ]          # space or equal
8490                             | - (?! [.0-9] )  # or non-unary minus.
8491                         )                     # $1 includes the character
8492                     )/x)
8493                 {
8494                     # Split into the initial part that fits, and remaining
8495                     # part of the input
8496                     $remainder = substr($line[$i], length $1);
8497                     $line[$i] = $1;
8498                     trace $line[$i] if DEBUG && $to_trace;
8499                     trace $remainder if DEBUG && $to_trace;
8500                 }
8501
8502                 # If didn't find a good breaking spot, see if there is a
8503                 # not-so-good breaking spot.  These are just after
8504                 # underscores or where the case changes from lower to
8505                 # upper.  Use \a as a soft hyphen, but give up
8506                 # and don't break the line if there is actually a \a
8507                 # already in the input.  We use an ascii character for the
8508                 # soft-hyphen to avoid any attempt by miniperl to try to
8509                 # access the files that this program is creating.
8510                 elsif ($segment !~ /\a/
8511                        && ($segment =~ s/_/_\a/g
8512                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8513                 {
8514                     # Here were able to find at least one place to insert
8515                     # our substitute soft hyphen.  Find the right-most one
8516                     # and replace it by a real hyphen.
8517                     trace $segment if DEBUG && $to_trace;
8518                     substr($segment,
8519                             rindex($segment, "\a"),
8520                             1) = '-';
8521
8522                     # Then remove the soft hyphen substitutes.
8523                     $segment =~ s/\a//g;
8524                     trace $segment if DEBUG && $to_trace;
8525
8526                     # And split into the initial part that fits, and
8527                     # remainder of the line
8528                     my $pos = rindex($segment, '-');
8529                     $remainder = substr($line[$i], $pos);
8530                     trace $remainder if DEBUG && $to_trace;
8531                     $line[$i] = substr($segment, 0, $pos + 1);
8532                 }
8533             }
8534
8535             # Here we know if we can fold or not.  If we can, $remainder
8536             # is what remains to be processed in the next iteration.
8537             if (defined $remainder) {
8538                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8539
8540                 # Insert the folded remainder of the line as a new element
8541                 # of the array.  (It may still be too long, but we will
8542                 # deal with that next time through the loop.)  Omit any
8543                 # leading space in the remainder.
8544                 $remainder =~ s/^\s+//;
8545                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8546
8547                 # But then indent by whichever is larger of:
8548                 # 1) the leading space on the input line;
8549                 # 2) the hanging indent.
8550                 # This preserves indentation in the original line.
8551                 my $lead = ($leading_space)
8552                             ? length $leading_space
8553                             : $hanging_indent;
8554                 $lead = max($lead, $hanging_indent);
8555                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8556             }
8557         }
8558
8559         # Ready to output the line. Get rid of any trailing space
8560         # And prefix by the required $prefix passed in.
8561         $line[$i] =~ s/\s+$//;
8562         $line[$i] = "$prefix$line[$i]\n";
8563     } # End of looping through all the lines.
8564
8565     return join "", @line;
8566 }
8567
8568 sub property_ref {  # Returns a reference to a property object.
8569     return Property::property_ref(@_);
8570 }
8571
8572 sub force_unlink ($) {
8573     my $filename = shift;
8574     return unless file_exists($filename);
8575     return if CORE::unlink($filename);
8576
8577     # We might need write permission
8578     chmod 0777, $filename;
8579     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
8580     return;
8581 }
8582
8583 sub write ($$@) {
8584     # Given a filename and references to arrays of lines, write the lines of
8585     # each array to the file
8586     # Filename can be given as an arrayref of directory names
8587
8588     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8589
8590     my $file  = shift;
8591     my $use_utf8 = shift;
8592
8593     # Get into a single string if an array, and get rid of, in Unix terms, any
8594     # leading '.'
8595     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8596     $file = File::Spec->canonpath($file);
8597
8598     # If has directories, make sure that they all exist
8599     (undef, my $directories, undef) = File::Spec->splitpath($file);
8600     File::Path::mkpath($directories) if $directories && ! -d $directories;
8601
8602     push @files_actually_output, $file;
8603
8604     force_unlink ($file);
8605
8606     my $OUT;
8607     if (not open $OUT, ">", $file) {
8608         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
8609         return;
8610     }
8611
8612     binmode $OUT, ":utf8" if $use_utf8;
8613
8614     while (defined (my $lines_ref = shift)) {
8615         unless (@$lines_ref) {
8616             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8617         }
8618
8619         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8620     }
8621     close $OUT or die Carp::my_carp("close '$file' failed: $!");
8622
8623     print "$file written.\n" if $verbosity >= $VERBOSE;
8624
8625     return;
8626 }
8627
8628
8629 sub Standardize($) {
8630     # This converts the input name string into a standardized equivalent to
8631     # use internally.
8632
8633     my $name = shift;
8634     unless (defined $name) {
8635       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
8636       return;
8637     }
8638
8639     # Remove any leading or trailing white space
8640     $name =~ s/^\s+//g;
8641     $name =~ s/\s+$//g;
8642
8643     # Convert interior white space and hyphens into underscores.
8644     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8645
8646     # Capitalize the letter following an underscore, and convert a sequence of
8647     # multiple underscores to a single one
8648     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8649
8650     # And capitalize the first letter, but not for the special cjk ones.
8651     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8652     return $name;
8653 }
8654
8655 sub standardize ($) {
8656     # Returns a lower-cased standardized name, without underscores.  This form
8657     # is chosen so that it can distinguish between any real versus superficial
8658     # Unicode name differences.  It relies on the fact that Unicode doesn't
8659     # have interior underscores, white space, nor dashes in any
8660     # stricter-matched name.  It should not be used on Unicode code point
8661     # names (the Name property), as they mostly, but not always follow these
8662     # rules.
8663
8664     my $name = Standardize(shift);
8665     return if !defined $name;
8666
8667     $name =~ s/ (?<= .) _ (?= . ) //xg;
8668     return lc $name;
8669 }
8670
8671 sub utf8_heavy_name ($$) {
8672     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
8673     # perhaps this function should be placed somewhere, like Heavy.pl so that
8674     # utf8_heavy can use it directly without duplicating code that can get
8675     # out-of sync.
8676
8677     my $table = shift;
8678     my $alias = shift;
8679     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8680
8681     my $property = $table->property;
8682     $property = ($property == $perl)
8683                 ? ""                # 'perl' is never explicitly stated
8684                 : standardize($property->name) . '=';
8685     if ($alias->loose_match) {
8686         return $property . standardize($alias->name);
8687     }
8688     else {
8689         return lc ($property . $alias->name);
8690     }
8691
8692     return;
8693 }
8694
8695 {   # Closure
8696
8697     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8698     my %already_output;
8699
8700     $main::simple_dumper_nesting = 0;
8701
8702     sub simple_dumper {
8703         # Like Simple Data::Dumper. Good enough for our needs. We can't use
8704         # the real thing as we have to run under miniperl.
8705
8706         # It is designed so that on input it is at the beginning of a line,
8707         # and the final thing output in any call is a trailing ",\n".
8708
8709         my $item = shift;
8710         my $indent = shift;
8711         $indent = "" if ! $debugging_build || ! defined $indent;
8712
8713         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8714
8715         # nesting level is localized, so that as the call stack pops, it goes
8716         # back to the prior value.
8717         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8718         undef %already_output if $main::simple_dumper_nesting == 0;
8719         $main::simple_dumper_nesting++;
8720         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8721
8722         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8723
8724         # Determine the indent for recursive calls.
8725         my $next_indent = $indent . $indent_increment;
8726
8727         my $output;
8728         if (! ref $item) {
8729
8730             # Dump of scalar: just output it in quotes if not a number.  To do
8731             # so we must escape certain characters, and therefore need to
8732             # operate on a copy to avoid changing the original
8733             my $copy = $item;
8734             $copy = $UNDEF unless defined $copy;
8735
8736             # Quote non-integers (integers also have optional leading '-')
8737             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8738
8739                 # Escape apostrophe and backslash
8740                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8741                 $copy = "'$copy'";
8742             }
8743             $output = "$indent$copy,\n";
8744         }
8745         else {
8746
8747             # Keep track of cycles in the input, and refuse to infinitely loop
8748             my $addr = do { no overloading; pack 'J', $item; };
8749             if (defined $already_output{$addr}) {
8750                 return "${indent}ALREADY OUTPUT: $item\n";
8751             }
8752             $already_output{$addr} = $item;
8753
8754             if (ref $item eq 'ARRAY') {
8755                 my $using_brackets;
8756                 $output = $indent;
8757                 if ($main::simple_dumper_nesting > 1) {
8758                     $output .= '[';
8759                     $using_brackets = 1;
8760                 }
8761                 else {
8762                     $using_brackets = 0;
8763                 }
8764
8765                 # If the array is empty, put the closing bracket on the same
8766                 # line.  Otherwise, recursively add each array element
8767                 if (@$item == 0) {
8768                     $output .= " ";
8769                 }
8770                 else {
8771                     $output .= "\n";
8772                     for (my $i = 0; $i < @$item; $i++) {
8773
8774                         # Indent array elements one level
8775                         $output .= &simple_dumper($item->[$i], $next_indent);
8776                         next if ! $debugging_build;
8777                         $output =~ s/\n$//;      # Remove any trailing nl so
8778                         $output .= " # [$i]\n";  # as to add a comment giving
8779                                                  # the array index
8780                     }
8781                     $output .= $indent;     # Indent closing ']' to orig level
8782                 }
8783                 $output .= ']' if $using_brackets;
8784                 $output .= ",\n";
8785             }
8786             elsif (ref $item eq 'HASH') {
8787                 my $is_first_line;
8788                 my $using_braces;
8789                 my $body_indent;
8790
8791                 # No surrounding braces at top level
8792                 $output .= $indent;
8793                 if ($main::simple_dumper_nesting > 1) {
8794                     $output .= "{\n";
8795                     $is_first_line = 0;
8796                     $body_indent = $next_indent;
8797                     $next_indent .= $indent_increment;
8798                     $using_braces = 1;
8799                 }
8800                 else {
8801                     $is_first_line = 1;
8802                     $body_indent = $indent;
8803                     $using_braces = 0;
8804                 }
8805
8806                 # Output hashes sorted alphabetically instead of apparently
8807                 # random.  Use caseless alphabetic sort
8808                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8809                 {
8810                     if ($is_first_line) {
8811                         $is_first_line = 0;
8812                     }
8813                     else {
8814                         $output .= "$body_indent";
8815                     }
8816
8817                     # The key must be a scalar, but this recursive call quotes
8818                     # it
8819                     $output .= &simple_dumper($key);
8820
8821                     # And change the trailing comma and nl to the hash fat
8822                     # comma for clarity, and so the value can be on the same
8823                     # line
8824                     $output =~ s/,\n$/ => /;
8825
8826                     # Recursively call to get the value's dump.
8827                     my $next = &simple_dumper($item->{$key}, $next_indent);
8828
8829                     # If the value is all on one line, remove its indent, so
8830                     # will follow the => immediately.  If it takes more than
8831                     # one line, start it on a new line.
8832                     if ($next !~ /\n.*\n/) {
8833                         $next =~ s/^ *//;
8834                     }
8835                     else {
8836                         $output .= "\n";
8837                     }
8838                     $output .= $next;
8839                 }
8840
8841                 $output .= "$indent},\n" if $using_braces;
8842             }
8843             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8844                 $output = $indent . ref($item) . "\n";
8845                 # XXX see if blessed
8846             }
8847             elsif ($item->can('dump')) {
8848
8849                 # By convention in this program, objects furnish a 'dump'
8850                 # method.  Since not doing any output at this level, just pass
8851                 # on the input indent
8852                 $output = $item->dump($indent);
8853             }
8854             else {
8855                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
8856             }
8857         }
8858         return $output;
8859     }
8860 }
8861
8862 sub dump_inside_out {
8863     # Dump inside-out hashes in an object's state by converting them to a
8864     # regular hash and then calling simple_dumper on that.
8865
8866     my $object = shift;
8867     my $fields_ref = shift;
8868     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8869
8870     my $addr = do { no overloading; pack 'J', $object; };
8871
8872     my %hash;
8873     foreach my $key (keys %$fields_ref) {
8874         $hash{$key} = $fields_ref->{$key}{$addr};
8875     }
8876
8877     return simple_dumper(\%hash, @_);
8878 }
8879
8880 sub _operator_dot {
8881     # Overloaded '.' method that is common to all packages.  It uses the
8882     # package's stringify method.
8883
8884     my $self = shift;
8885     my $other = shift;
8886     my $reversed = shift;
8887     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8888
8889     $other = "" unless defined $other;
8890
8891     foreach my $which (\$self, \$other) {
8892         next unless ref $$which;
8893         if ($$which->can('_operator_stringify')) {
8894             $$which = $$which->_operator_stringify;
8895         }
8896         else {
8897             my $ref = ref $$which;
8898             my $addr = do { no overloading; pack 'J', $$which; };
8899             $$which = "$ref ($addr)";
8900         }
8901     }
8902     return ($reversed)
8903             ? "$other$self"
8904             : "$self$other";
8905 }
8906
8907 sub _operator_dot_equal {
8908     # Overloaded '.=' method that is common to all packages.
8909
8910     my $self = shift;
8911     my $other = shift;
8912     my $reversed = shift;
8913     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8914
8915     $other = "" unless defined $other;
8916
8917     if ($reversed) {
8918         return $other .= "$self";
8919     }
8920     else {
8921         return "$self" . "$other";
8922     }
8923 }
8924
8925 sub _operator_equal {
8926     # Generic overloaded '==' routine.  To be equal, they must be the exact
8927     # same object
8928
8929     my $self = shift;
8930     my $other = shift;
8931
8932     return 0 unless defined $other;
8933     return 0 unless ref $other;
8934     no overloading;
8935     return $self == $other;
8936 }
8937
8938 sub _operator_not_equal {
8939     my $self = shift;
8940     my $other = shift;
8941
8942     return ! _operator_equal($self, $other);
8943 }
8944
8945 sub process_PropertyAliases($) {
8946     # This reads in the PropertyAliases.txt file, which contains almost all
8947     # the character properties in Unicode and their equivalent aliases:
8948     # scf       ; Simple_Case_Folding         ; sfc
8949     #
8950     # Field 0 is the preferred short name for the property.
8951     # Field 1 is the full name.
8952     # Any succeeding ones are other accepted names.
8953
8954     my $file= shift;
8955     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8956
8957     # This whole file was non-existent in early releases, so use our own
8958     # internal one.
8959     $file->insert_lines(get_old_property_aliases())
8960                                                 if ! -e 'PropertyAliases.txt';
8961
8962     # Add any cjk properties that may have been defined.
8963     $file->insert_lines(@cjk_properties);
8964
8965     while ($file->next_line) {
8966
8967         my @data = split /\s*;\s*/;
8968
8969         my $full = $data[1];
8970
8971         my $this = Property->new($data[0], Full_Name => $full);
8972
8973         # Start looking for more aliases after these two.
8974         for my $i (2 .. @data - 1) {
8975             $this->add_alias($data[$i]);
8976         }
8977
8978     }
8979
8980     my $scf = property_ref("Simple_Case_Folding");
8981     $scf->add_alias("scf");
8982     $scf->add_alias("sfc");
8983
8984     return;
8985 }
8986
8987 sub finish_property_setup {
8988     # Finishes setting up after PropertyAliases.
8989
8990     my $file = shift;
8991     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8992
8993     # This entry was missing from this file in earlier Unicode versions
8994     if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
8995         Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8996     }
8997
8998     # These two properties must be defined in all releases so we can generate
8999     # the tables from them to make regex \X work, but suppress their output so
9000     # aren't application visible prior to releases where they should be
9001     if (! defined property_ref('GCB')) {
9002         Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9003                       Fate => $PLACEHOLDER);
9004     }
9005     if (! defined property_ref('hst')) {
9006         Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9007                       Fate => $PLACEHOLDER);
9008     }
9009
9010     # These are used so much, that we set globals for them.
9011     $gc = property_ref('General_Category');
9012     $block = property_ref('Block');
9013     $script = property_ref('Script');
9014
9015     # Perl adds this alias.
9016     $gc->add_alias('Category');
9017
9018     # Unicode::Normalize expects this file with this name and directory.
9019     my $ccc = property_ref('Canonical_Combining_Class');
9020     if (defined $ccc) {
9021         $ccc->set_file('CombiningClass');
9022         $ccc->set_directory(File::Spec->curdir());
9023     }
9024
9025     # These two properties aren't actually used in the core, but unfortunately
9026     # the names just above that are in the core interfere with these, so
9027     # choose different names.  These aren't a problem unless the map tables
9028     # for these files get written out.
9029     my $lowercase = property_ref('Lowercase');
9030     $lowercase->set_file('IsLower') if defined $lowercase;
9031     my $uppercase = property_ref('Uppercase');
9032     $uppercase->set_file('IsUpper') if defined $uppercase;
9033
9034     # Set up the hard-coded default mappings, but only on properties defined
9035     # for this release
9036     foreach my $property (keys %default_mapping) {
9037         my $property_object = property_ref($property);
9038         next if ! defined $property_object;
9039         my $default_map = $default_mapping{$property};
9040         $property_object->set_default_map($default_map);
9041
9042         # A map of <code point> implies the property is string.
9043         if ($property_object->type == $UNKNOWN
9044             && $default_map eq $CODE_POINT)
9045         {
9046             $property_object->set_type($STRING);
9047         }
9048     }
9049
9050     # The following use the Multi_Default class to create objects for
9051     # defaults.
9052
9053     # Bidi class has a complicated default, but the derived file takes care of
9054     # the complications, leaving just 'L'.
9055     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9056         property_ref('Bidi_Class')->set_default_map('L');
9057     }
9058     else {
9059         my $default;
9060
9061         # The derived file was introduced in 3.1.1.  The values below are
9062         # taken from table 3-8, TUS 3.0
9063         my $default_R =
9064             'my $default = Range_List->new;
9065              $default->add_range(0x0590, 0x05FF);
9066              $default->add_range(0xFB1D, 0xFB4F);'
9067         ;
9068
9069         # The defaults apply only to unassigned characters
9070         $default_R .= '$gc->table("Unassigned") & $default;';
9071
9072         if ($v_version lt v3.0.0) {
9073             $default = Multi_Default->new(R => $default_R, 'L');
9074         }
9075         else {
9076
9077             # AL apparently not introduced until 3.0:  TUS 2.x references are
9078             # not on-line to check it out
9079             my $default_AL =
9080                 'my $default = Range_List->new;
9081                  $default->add_range(0x0600, 0x07BF);
9082                  $default->add_range(0xFB50, 0xFDFF);
9083                  $default->add_range(0xFE70, 0xFEFF);'
9084             ;
9085
9086             # Non-character code points introduced in this release; aren't AL
9087             if ($v_version ge 3.1.0) {
9088                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9089             }
9090             $default_AL .= '$gc->table("Unassigned") & $default';
9091             $default = Multi_Default->new(AL => $default_AL,
9092                                           R => $default_R,
9093                                           'L');
9094         }
9095         property_ref('Bidi_Class')->set_default_map($default);
9096     }
9097
9098     # Joining type has a complicated default, but the derived file takes care
9099     # of the complications, leaving just 'U' (or Non_Joining), except the file
9100     # is bad in 3.1.0
9101     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9102         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9103             property_ref('Joining_Type')->set_default_map('Non_Joining');
9104         }
9105         else {
9106
9107             # Otherwise, there are not one, but two possibilities for the
9108             # missing defaults: T and U.
9109             # The missing defaults that evaluate to T are given by:
9110             # T = Mn + Cf - ZWNJ - ZWJ
9111             # where Mn and Cf are the general category values. In other words,
9112             # any non-spacing mark or any format control character, except
9113             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9114             # WIDTH JOINER (joining type C).
9115             my $default = Multi_Default->new(
9116                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9117                'Non_Joining');
9118             property_ref('Joining_Type')->set_default_map($default);
9119         }
9120     }
9121
9122     # Line break has a complicated default in early releases. It is 'Unknown'
9123     # for non-assigned code points; 'AL' for assigned.
9124     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9125         my $lb = property_ref('Line_Break');
9126         if ($v_version gt 3.2.0) {
9127             $lb->set_default_map('Unknown');
9128         }
9129         else {
9130             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9131                                               'AL');
9132             $lb->set_default_map($default);
9133         }
9134
9135         # If has the URS property, make sure that the standard aliases are in
9136         # it, since not in the input tables in some versions.
9137         my $urs = property_ref('Unicode_Radical_Stroke');
9138         if (defined $urs) {
9139             $urs->add_alias('cjkRSUnicode');
9140             $urs->add_alias('kRSUnicode');
9141         }
9142     }
9143
9144     # For backwards compatibility with applications that may read the mapping
9145     # file directly (it was documented in 5.12 and 5.14 as being thusly
9146     # usable), keep it from being adjusted.  (range_size_1 is
9147     # used to force the traditional format.)
9148     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9149         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9150         $nfkc_cf->set_range_size_1(1);
9151     }
9152     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9153         $bmg->set_to_output_map($EXTERNAL_MAP);
9154         $bmg->set_range_size_1(1);
9155     }
9156
9157     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9158
9159     return;
9160 }
9161
9162 sub get_old_property_aliases() {
9163     # Returns what would be in PropertyAliases.txt if it existed in very old
9164     # versions of Unicode.  It was derived from the one in 3.2, and pared
9165     # down based on the data that was actually in the older releases.
9166     # An attempt was made to use the existence of files to mean inclusion or
9167     # not of various aliases, but if this was not sufficient, using version
9168     # numbers was resorted to.
9169
9170     my @return;
9171
9172     # These are to be used in all versions (though some are constructed by
9173     # this program if missing)
9174     push @return, split /\n/, <<'END';
9175 bc        ; Bidi_Class
9176 Bidi_M    ; Bidi_Mirrored
9177 cf        ; Case_Folding
9178 ccc       ; Canonical_Combining_Class
9179 dm        ; Decomposition_Mapping
9180 dt        ; Decomposition_Type
9181 gc        ; General_Category
9182 isc       ; ISO_Comment
9183 lc        ; Lowercase_Mapping
9184 na        ; Name
9185 na1       ; Unicode_1_Name
9186 nt        ; Numeric_Type
9187 nv        ; Numeric_Value
9188 scf       ; Simple_Case_Folding
9189 slc       ; Simple_Lowercase_Mapping
9190 stc       ; Simple_Titlecase_Mapping
9191 suc       ; Simple_Uppercase_Mapping
9192 tc        ; Titlecase_Mapping
9193 uc        ; Uppercase_Mapping
9194 END
9195
9196     if (-e 'Blocks.txt') {
9197         push @return, "blk       ; Block\n";
9198     }
9199     if (-e 'ArabicShaping.txt') {
9200         push @return, split /\n/, <<'END';
9201 jg        ; Joining_Group
9202 jt        ; Joining_Type
9203 END
9204     }
9205     if (-e 'PropList.txt') {
9206
9207         # This first set is in the original old-style proplist.
9208         push @return, split /\n/, <<'END';
9209 Bidi_C    ; Bidi_Control
9210 Dash      ; Dash
9211 Dia       ; Diacritic
9212 Ext       ; Extender
9213 Hex       ; Hex_Digit
9214 Hyphen    ; Hyphen
9215 IDC       ; ID_Continue
9216 Ideo      ; Ideographic
9217 Join_C    ; Join_Control
9218 Math      ; Math
9219 QMark     ; Quotation_Mark
9220 Term      ; Terminal_Punctuation
9221 WSpace    ; White_Space
9222 END
9223         # The next sets were added later
9224         if ($v_version ge v3.0.0) {
9225             push @return, split /\n/, <<'END';
9226 Upper     ; Uppercase
9227 Lower     ; Lowercase
9228 END
9229         }
9230         if ($v_version ge v3.0.1) {
9231             push @return, split /\n/, <<'END';
9232 NChar     ; Noncharacter_Code_Point
9233 END
9234         }
9235         # The next sets were added in the new-style
9236         if ($v_version ge v3.1.0) {
9237             push @return, split /\n/, <<'END';
9238 OAlpha    ; Other_Alphabetic
9239 OLower    ; Other_Lowercase
9240 OMath     ; Other_Math
9241 OUpper    ; Other_Uppercase
9242 END
9243         }
9244         if ($v_version ge v3.1.1) {
9245             push @return, "AHex      ; ASCII_Hex_Digit\n";
9246         }
9247     }
9248     if (-e 'EastAsianWidth.txt') {
9249         push @return, "ea        ; East_Asian_Width\n";
9250     }
9251     if (-e 'CompositionExclusions.txt') {
9252         push @return, "CE        ; Composition_Exclusion\n";
9253     }
9254     if (-e 'LineBreak.txt') {
9255         push @return, "lb        ; Line_Break\n";
9256     }
9257     if (-e 'BidiMirroring.txt') {
9258         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
9259     }
9260     if (-e 'Scripts.txt') {
9261         push @return, "sc        ; Script\n";
9262     }
9263     if (-e 'DNormalizationProps.txt') {
9264         push @return, split /\n/, <<'END';
9265 Comp_Ex   ; Full_Composition_Exclusion
9266 FC_NFKC   ; FC_NFKC_Closure
9267 NFC_QC    ; NFC_Quick_Check
9268 NFD_QC    ; NFD_Quick_Check
9269 NFKC_QC   ; NFKC_Quick_Check
9270 NFKD_QC   ; NFKD_Quick_Check
9271 XO_NFC    ; Expands_On_NFC
9272 XO_NFD    ; Expands_On_NFD
9273 XO_NFKC   ; Expands_On_NFKC
9274 XO_NFKD   ; Expands_On_NFKD
9275 END
9276     }
9277     if (-e 'DCoreProperties.txt') {
9278         push @return, split /\n/, <<'END';
9279 Alpha     ; Alphabetic
9280 IDS       ; ID_Start
9281 XIDC      ; XID_Continue
9282 XIDS      ; XID_Start
9283 END
9284         # These can also appear in some versions of PropList.txt
9285         push @return, "Lower     ; Lowercase\n"
9286                                     unless grep { $_ =~ /^Lower\b/} @return;
9287         push @return, "Upper     ; Uppercase\n"
9288                                     unless grep { $_ =~ /^Upper\b/} @return;
9289     }
9290
9291     # This flag requires the DAge.txt file to be copied into the directory.
9292     if (DEBUG && $compare_versions) {
9293         push @return, 'age       ; Age';
9294     }
9295
9296     return @return;
9297 }
9298
9299 sub process_PropValueAliases {
9300     # This file contains values that properties look like:
9301     # bc ; AL        ; Arabic_Letter
9302     # blk; n/a       ; Greek_And_Coptic                 ; Greek
9303     #
9304     # Field 0 is the property.
9305     # Field 1 is the short name of a property value or 'n/a' if no
9306     #                short name exists;
9307     # Field 2 is the full property value name;
9308     # Any other fields are more synonyms for the property value.
9309     # Purely numeric property values are omitted from the file; as are some
9310     # others, fewer and fewer in later releases
9311
9312     # Entries for the ccc property have an extra field before the
9313     # abbreviation:
9314     # ccc;   0; NR   ; Not_Reordered
9315     # It is the numeric value that the names are synonyms for.
9316
9317     # There are comment entries for values missing from this file:
9318     # # @missing: 0000..10FFFF; ISO_Comment; <none>
9319     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9320
9321     my $file= shift;
9322     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9323
9324     # This whole file was non-existent in early releases, so use our own
9325     # internal one if necessary.
9326     if (! -e 'PropValueAliases.txt') {
9327         $file->insert_lines(get_old_property_value_aliases());
9328     }
9329
9330     if ($v_version lt 4.0.0) {
9331         $file->insert_lines(split /\n/, <<'END'
9332 hst; L                                ; Leading_Jamo
9333 hst; LV                               ; LV_Syllable
9334 hst; LVT                              ; LVT_Syllable
9335 hst; NA                               ; Not_Applicable
9336 hst; T                                ; Trailing_Jamo
9337 hst; V                                ; Vowel_Jamo
9338 END
9339         );
9340     }
9341     if ($v_version lt 4.1.0) {
9342         $file->insert_lines(split /\n/, <<'END'
9343 GCB; CN                               ; Control
9344 GCB; CR                               ; CR
9345 GCB; EX                               ; Extend
9346 GCB; L                                ; L
9347 GCB; LF                               ; LF
9348 GCB; LV                               ; LV
9349 GCB; LVT                              ; LVT
9350 GCB; T                                ; T
9351 GCB; V                                ; V
9352 GCB; XX                               ; Other
9353 END
9354         );
9355     }
9356
9357
9358     # Add any explicit cjk values
9359     $file->insert_lines(@cjk_property_values);
9360
9361     # This line is used only for testing the code that checks for name
9362     # conflicts.  There is a script Inherited, and when this line is executed
9363     # it causes there to be a name conflict with the 'Inherited' that this
9364     # program generates for this block property value
9365     #$file->insert_lines('blk; n/a; Herited');
9366
9367
9368     # Process each line of the file ...
9369     while ($file->next_line) {
9370
9371         # Fix typo in input file
9372         s/CCC133/CCC132/g if $v_version eq v6.1.0;
9373
9374         my ($property, @data) = split /\s*;\s*/;
9375
9376         # The ccc property has an extra field at the beginning, which is the
9377         # numeric value.  Move it to be after the other two, mnemonic, fields,
9378         # so that those will be used as the property value's names, and the
9379         # number will be an extra alias.  (Rightmost splice removes field 1-2,
9380         # returning them in a slice; left splice inserts that before anything,
9381         # thus shifting the former field 0 to after them.)
9382         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9383
9384         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
9385         # there is no short name, use the full one in element 1
9386         if ($data[0] eq "n/a") {
9387             $data[0] = $data[1];
9388         }
9389         elsif ($data[0] ne $data[1]
9390                && standardize($data[0]) eq standardize($data[1])
9391                && $data[1] !~ /[[:upper:]]/)
9392         {
9393             # Also, there is a bug in the file in which "n/a" is omitted, and
9394             # the two fields are identical except for case, and the full name
9395             # is all lower case.  Copy the "short" name unto the full one to
9396             # give it some upper case.
9397
9398             $data[1] = $data[0];
9399         }
9400
9401         # Earlier releases had the pseudo property 'qc' that should expand to
9402         # the ones that replace it below.
9403         if ($property eq 'qc') {
9404             if (lc $data[0] eq 'y') {
9405                 $file->insert_lines('NFC_QC; Y      ; Yes',
9406                                     'NFD_QC; Y      ; Yes',
9407                                     'NFKC_QC; Y     ; Yes',
9408                                     'NFKD_QC; Y     ; Yes',
9409                                     );
9410             }
9411             elsif (lc $data[0] eq 'n') {
9412                 $file->insert_lines('NFC_QC; N      ; No',
9413                                     'NFD_QC; N      ; No',
9414                                     'NFKC_QC; N     ; No',
9415                                     'NFKD_QC; N     ; No',
9416                                     );
9417             }
9418             elsif (lc $data[0] eq 'm') {
9419                 $file->insert_lines('NFC_QC; M      ; Maybe',
9420                                     'NFKC_QC; M     ; Maybe',
9421                                     );
9422             }
9423             else {
9424                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9425             }
9426             next;
9427         }
9428
9429         # The first field is the short name, 2nd is the full one.
9430         my $property_object = property_ref($property);
9431         my $table = $property_object->add_match_table($data[0],
9432                                                 Full_Name => $data[1]);
9433
9434         # Start looking for more aliases after these two.
9435         for my $i (2 .. @data - 1) {
9436             $table->add_alias($data[$i]);
9437         }
9438     } # End of looping through the file
9439
9440     # As noted in the comments early in the program, it generates tables for
9441     # the default values for all releases, even those for which the concept
9442     # didn't exist at the time.  Here we add those if missing.
9443     my $age = property_ref('age');
9444     if (defined $age && ! defined $age->table('Unassigned')) {
9445         $age->add_match_table('Unassigned');
9446     }
9447     $block->add_match_table('No_Block') if -e 'Blocks.txt'
9448                                     && ! defined $block->table('No_Block');
9449
9450
9451     # Now set the default mappings of the properties from the file.  This is
9452     # done after the loop because a number of properties have only @missings
9453     # entries in the file, and may not show up until the end.
9454     my @defaults = $file->get_missings;
9455     foreach my $default_ref (@defaults) {
9456         my $default = $default_ref->[0];
9457         my $property = property_ref($default_ref->[1]);
9458         $property->set_default_map($default);
9459     }
9460     return;
9461 }
9462
9463 sub get_old_property_value_aliases () {
9464     # Returns what would be in PropValueAliases.txt if it existed in very old
9465     # versions of Unicode.  It was derived from the one in 3.2, and pared
9466     # down.  An attempt was made to use the existence of files to mean
9467     # inclusion or not of various aliases, but if this was not sufficient,
9468     # using version numbers was resorted to.
9469
9470     my @return = split /\n/, <<'END';
9471 bc ; AN        ; Arabic_Number
9472 bc ; B         ; Paragraph_Separator
9473 bc ; CS        ; Common_Separator
9474 bc ; EN        ; European_Number
9475 bc ; ES        ; European_Separator
9476 bc ; ET        ; European_Terminator
9477 bc ; L         ; Left_To_Right
9478 bc ; ON        ; Other_Neutral
9479 bc ; R         ; Right_To_Left
9480 bc ; WS        ; White_Space
9481
9482 Bidi_M; N; No; F; False
9483 Bidi_M; Y; Yes; T; True
9484
9485 # The standard combining classes are very much different in v1, so only use
9486 # ones that look right (not checked thoroughly)
9487 ccc;   0; NR   ; Not_Reordered
9488 ccc;   1; OV   ; Overlay
9489 ccc;   7; NK   ; Nukta
9490 ccc;   8; KV   ; Kana_Voicing
9491 ccc;   9; VR   ; Virama
9492 ccc; 202; ATBL ; Attached_Below_Left
9493 ccc; 216; ATAR ; Attached_Above_Right
9494 ccc; 218; BL   ; Below_Left
9495 ccc; 220; B    ; Below
9496 ccc; 222; BR   ; Below_Right
9497 ccc; 224; L    ; Left
9498 ccc; 228; AL   ; Above_Left
9499 ccc; 230; A    ; Above
9500 ccc; 232; AR   ; Above_Right
9501 ccc; 234; DA   ; Double_Above
9502
9503 dt ; can       ; canonical
9504 dt ; enc       ; circle
9505 dt ; fin       ; final
9506 dt ; font      ; font
9507 dt ; fra       ; fraction
9508 dt ; init      ; initial
9509 dt ; iso       ; isolated
9510 dt ; med       ; medial
9511 dt ; n/a       ; none
9512 dt ; nb        ; noBreak
9513 dt ; sqr       ; square
9514 dt ; sub       ; sub
9515 dt ; sup       ; super
9516
9517 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
9518 gc ; Cc        ; Control
9519 gc ; Cn        ; Unassigned
9520 gc ; Co        ; Private_Use
9521 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
9522 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
9523 gc ; Ll        ; Lowercase_Letter
9524 gc ; Lm        ; Modifier_Letter
9525 gc ; Lo        ; Other_Letter
9526 gc ; Lu        ; Uppercase_Letter
9527 gc ; M         ; Mark                             # Mc | Me | Mn
9528 gc ; Mc        ; Spacing_Mark
9529 gc ; Mn        ; Nonspacing_Mark
9530 gc ; N         ; Number                           # Nd | Nl | No
9531 gc ; Nd        ; Decimal_Number
9532 gc ; No        ; Other_Number
9533 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
9534 gc ; Pd        ; Dash_Punctuation
9535 gc ; Pe        ; Close_Punctuation
9536 gc ; Po        ; Other_Punctuation
9537 gc ; Ps        ; Open_Punctuation
9538 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
9539 gc ; Sc        ; Currency_Symbol
9540 gc ; Sm        ; Math_Symbol
9541 gc ; So        ; Other_Symbol
9542 gc ; Z         ; Separator                        # Zl | Zp | Zs
9543 gc ; Zl        ; Line_Separator
9544 gc ; Zp        ; Paragraph_Separator
9545 gc ; Zs        ; Space_Separator
9546
9547 nt ; de        ; Decimal
9548 nt ; di        ; Digit
9549 nt ; n/a       ; None
9550 nt ; nu        ; Numeric
9551 END
9552
9553     if (-e 'ArabicShaping.txt') {
9554         push @return, split /\n/, <<'END';
9555 jg ; n/a       ; AIN
9556 jg ; n/a       ; ALEF
9557 jg ; n/a       ; DAL
9558 jg ; n/a       ; GAF
9559 jg ; n/a       ; LAM
9560 jg ; n/a       ; MEEM
9561 jg ; n/a       ; NO_JOINING_GROUP
9562 jg ; n/a       ; NOON
9563 jg ; n/a       ; QAF
9564 jg ; n/a       ; SAD
9565 jg ; n/a       ; SEEN
9566 jg ; n/a       ; TAH
9567 jg ; n/a       ; WAW
9568
9569 jt ; C         ; Join_Causing
9570 jt ; D         ; Dual_Joining
9571 jt ; L         ; Left_Joining
9572 jt ; R         ; Right_Joining
9573 jt ; U         ; Non_Joining
9574 jt ; T         ; Transparent
9575 END
9576         if ($v_version ge v3.0.0) {
9577             push @return, split /\n/, <<'END';
9578 jg ; n/a       ; ALAPH
9579 jg ; n/a       ; BEH
9580 jg ; n/a       ; BETH
9581 jg ; n/a       ; DALATH_RISH
9582 jg ; n/a       ; E
9583 jg ; n/a       ; FEH
9584 jg ; n/a       ; FINAL_SEMKATH
9585 jg ; n/a       ; GAMAL
9586 jg ; n/a       ; HAH
9587 jg ; n/a       ; HAMZA_ON_HEH_GOAL
9588 jg ; n/a       ; HE
9589 jg ; n/a       ; HEH
9590 jg ; n/a       ; HEH_GOAL
9591 jg ; n/a       ; HETH
9592 jg ; n/a       ; KAF
9593 jg ; n/a       ; KAPH
9594 jg ; n/a       ; KNOTTED_HEH
9595 jg ; n/a       ; LAMADH
9596 jg ; n/a       ; MIM
9597 jg ; n/a       ; NUN
9598 jg ; n/a       ; PE
9599 jg ; n/a       ; QAPH
9600 jg ; n/a       ; REH
9601 jg ; n/a       ; REVERSED_PE
9602 jg ; n/a       ; SADHE
9603 jg ; n/a       ; SEMKATH
9604 jg ; n/a       ; SHIN
9605 jg ; n/a       ; SWASH_KAF
9606 jg ; n/a       ; TAW
9607 jg ; n/a       ; TEH_MARBUTA
9608 jg ; n/a       ; TETH
9609 jg ; n/a       ; YEH
9610 jg ; n/a       ; YEH_BARREE
9611 jg ; n/a       ; YEH_WITH_TAIL
9612 jg ; n/a       ; YUDH
9613 jg ; n/a       ; YUDH_HE
9614 jg ; n/a       ; ZAIN
9615 END
9616         }
9617     }
9618
9619
9620     if (-e 'EastAsianWidth.txt') {
9621         push @return, split /\n/, <<'END';
9622 ea ; A         ; Ambiguous
9623 ea ; F         ; Fullwidth
9624 ea ; H         ; Halfwidth
9625 ea ; N         ; Neutral
9626 ea ; Na        ; Narrow
9627 ea ; W         ; Wide
9628 END
9629     }
9630
9631     if (-e 'LineBreak.txt') {
9632         push @return, split /\n/, <<'END';
9633 lb ; AI        ; Ambiguous
9634 lb ; AL        ; Alphabetic
9635 lb ; B2        ; Break_Both
9636 lb ; BA        ; Break_After
9637 lb ; BB        ; Break_Before
9638 lb ; BK        ; Mandatory_Break
9639 lb ; CB        ; Contingent_Break
9640 lb ; CL        ; Close_Punctuation
9641 lb ; CM        ; Combining_Mark
9642 lb ; CR        ; Carriage_Return
9643 lb ; EX        ; Exclamation
9644 lb ; GL        ; Glue
9645 lb ; HY        ; Hyphen
9646 lb ; ID        ; Ideographic
9647 lb ; IN        ; Inseperable
9648 lb ; IS        ; Infix_Numeric
9649 lb ; LF        ; Line_Feed
9650 lb ; NS        ; Nonstarter
9651 lb ; NU        ; Numeric
9652 lb ; OP        ; Open_Punctuation
9653 lb ; PO        ; Postfix_Numeric
9654 lb ; PR        ; Prefix_Numeric
9655 lb ; QU        ; Quotation
9656 lb ; SA        ; Complex_Context
9657 lb ; SG        ; Surrogate
9658 lb ; SP        ; Space
9659 lb ; SY        ; Break_Symbols
9660 lb ; XX        ; Unknown
9661 lb ; ZW        ; ZWSpace
9662 END
9663     }
9664
9665     if (-e 'DNormalizationProps.txt') {
9666         push @return, split /\n/, <<'END';
9667 qc ; M         ; Maybe
9668 qc ; N         ; No
9669 qc ; Y         ; Yes
9670 END
9671     }
9672
9673     if (-e 'Scripts.txt') {
9674         push @return, split /\n/, <<'END';
9675 sc ; Arab      ; Arabic
9676 sc ; Armn      ; Armenian
9677 sc ; Beng      ; Bengali
9678 sc ; Bopo      ; Bopomofo
9679 sc ; Cans      ; Canadian_Aboriginal
9680 sc ; Cher      ; Cherokee
9681 sc ; Cyrl      ; Cyrillic
9682 sc ; Deva      ; Devanagari
9683 sc ; Dsrt      ; Deseret
9684 sc ; Ethi      ; Ethiopic
9685 sc ; Geor      ; Georgian
9686 sc ; Goth      ; Gothic
9687 sc ; Grek      ; Greek
9688 sc ; Gujr      ; Gujarati
9689 sc ; Guru      ; Gurmukhi
9690 sc ; Hang      ; Hangul
9691 sc ; Hani      ; Han
9692 sc ; Hebr      ; Hebrew
9693 sc ; Hira      ; Hiragana
9694 sc ; Ital      ; Old_Italic
9695 sc ; Kana      ; Katakana
9696 sc ; Khmr      ; Khmer
9697 sc ; Knda      ; Kannada
9698 sc ; Laoo      ; Lao
9699 sc ; Latn      ; Latin
9700 sc ; Mlym      ; Malayalam
9701 sc ; Mong      ; Mongolian
9702 sc ; Mymr      ; Myanmar
9703 sc ; Ogam      ; Ogham
9704 sc ; Orya      ; Oriya
9705 sc ; Qaai      ; Inherited
9706 sc ; Runr      ; Runic
9707 sc ; Sinh      ; Sinhala
9708 sc ; Syrc      ; Syriac
9709 sc ; Taml      ; Tamil
9710 sc ; Telu      ; Telugu
9711 sc ; Thaa      ; Thaana
9712 sc ; Thai      ; Thai
9713 sc ; Tibt      ; Tibetan
9714 sc ; Yiii      ; Yi
9715 sc ; Zyyy      ; Common
9716 END
9717     }
9718
9719     if ($v_version ge v2.0.0) {
9720         push @return, split /\n/, <<'END';
9721 dt ; com       ; compat
9722 dt ; nar       ; narrow
9723 dt ; sml       ; small
9724 dt ; vert      ; vertical
9725 dt ; wide      ; wide
9726
9727 gc ; Cf        ; Format
9728 gc ; Cs        ; Surrogate
9729 gc ; Lt        ; Titlecase_Letter
9730 gc ; Me        ; Enclosing_Mark
9731 gc ; Nl        ; Letter_Number
9732 gc ; Pc        ; Connector_Punctuation
9733 gc ; Sk        ; Modifier_Symbol
9734 END
9735     }
9736     if ($v_version ge v2.1.2) {
9737         push @return, "bc ; S         ; Segment_Separator\n";
9738     }
9739     if ($v_version ge v2.1.5) {
9740         push @return, split /\n/, <<'END';
9741 gc ; Pf        ; Final_Punctuation
9742 gc ; Pi        ; Initial_Punctuation
9743 END
9744     }
9745     if ($v_version ge v2.1.8) {
9746         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
9747     }
9748
9749     if ($v_version ge v3.0.0) {
9750         push @return, split /\n/, <<'END';
9751 bc ; AL        ; Arabic_Letter
9752 bc ; BN        ; Boundary_Neutral
9753 bc ; LRE       ; Left_To_Right_Embedding
9754 bc ; LRO       ; Left_To_Right_Override
9755 bc ; NSM       ; Nonspacing_Mark
9756 bc ; PDF       ; Pop_Directional_Format
9757 bc ; RLE       ; Right_To_Left_Embedding
9758 bc ; RLO       ; Right_To_Left_Override
9759
9760 ccc; 233; DB   ; Double_Below
9761 END
9762     }
9763
9764     if ($v_version ge v3.1.0) {
9765         push @return, "ccc; 226; R    ; Right\n";
9766     }
9767
9768     return @return;
9769 }
9770
9771 sub process_NormalizationsTest {
9772
9773     # Each line looks like:
9774     #      source code point; NFC; NFD; NFKC; NFKD
9775     # e.g.
9776     #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
9777
9778     my $file= shift;
9779     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9780
9781     # Process each line of the file ...
9782     while ($file->next_line) {
9783
9784         next if /^@/;
9785
9786         my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
9787
9788         foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
9789             $$var = pack "U0U*", map { hex } split " ", $$var;
9790             $$var =~ s/(\\)/$1$1/g;
9791         }
9792
9793         push @normalization_tests,
9794                 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
9795     } # End of looping through the file
9796 }
9797
9798 sub output_perl_charnames_line ($$) {
9799
9800     # Output the entries in Perl_charnames specially, using 5 digits instead
9801     # of four.  This makes the entries a constant length, and simplifies
9802     # charnames.pm which this table is for.  Unicode can have 6 digit
9803     # ordinals, but they are all private use or noncharacters which do not
9804     # have names, so won't be in this table.
9805
9806     return sprintf "%05X\t%s\n", $_[0], $_[1];
9807 }
9808
9809 { # Closure
9810     # This is used to store the range list of all the code points usable when
9811     # the little used $compare_versions feature is enabled.
9812     my $compare_versions_range_list;
9813
9814     # These are constants to the $property_info hash in this subroutine, to
9815     # avoid using a quoted-string which might have a typo.
9816     my $TYPE  = 'type';
9817     my $DEFAULT_MAP = 'default_map';
9818     my $DEFAULT_TABLE = 'default_table';
9819     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9820     my $MISSINGS = 'missings';
9821
9822     sub process_generic_property_file {
9823         # This processes a file containing property mappings and puts them
9824         # into internal map tables.  It should be used to handle any property
9825         # files that have mappings from a code point or range thereof to
9826         # something else.  This means almost all the UCD .txt files.
9827         # each_line_handlers() should be set to adjust the lines of these
9828         # files, if necessary, to what this routine understands:
9829         #
9830         # 0374          ; NFD_QC; N
9831         # 003C..003E    ; Math
9832         #
9833         # the fields are: "codepoint-range ; property; map"
9834         #
9835         # meaning the codepoints in the range all have the value 'map' under
9836         # 'property'.
9837         # Beginning and trailing white space in each field are not significant.
9838         # Note there is not a trailing semi-colon in the above.  A trailing
9839         # semi-colon means the map is a null-string.  An omitted map, as
9840         # opposed to a null-string, is assumed to be 'Y', based on Unicode
9841         # table syntax.  (This could have been hidden from this routine by
9842         # doing it in the $file object, but that would require parsing of the
9843         # line there, so would have to parse it twice, or change the interface
9844         # to pass this an array.  So not done.)
9845         #
9846         # The map field may begin with a sequence of commands that apply to
9847         # this range.  Each such command begins and ends with $CMD_DELIM.
9848         # These are used to indicate, for example, that the mapping for a
9849         # range has a non-default type.
9850         #
9851         # This loops through the file, calling it's next_line() method, and
9852         # then taking the map and adding it to the property's table.
9853         # Complications arise because any number of properties can be in the
9854         # file, in any order, interspersed in any way.  The first time a
9855         # property is seen, it gets information about that property and
9856         # caches it for quick retrieval later.  It also normalizes the maps
9857         # so that only one of many synonyms is stored.  The Unicode input
9858         # files do use some multiple synonyms.
9859
9860         my $file = shift;
9861         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9862
9863         my %property_info;               # To keep track of what properties
9864                                          # have already had entries in the
9865                                          # current file, and info about each,
9866                                          # so don't have to recompute.
9867         my $property_name;               # property currently being worked on
9868         my $property_type;               # and its type
9869         my $previous_property_name = ""; # name from last time through loop
9870         my $property_object;             # pointer to the current property's
9871                                          # object
9872         my $property_addr;               # the address of that object
9873         my $default_map;                 # the string that code points missing
9874                                          # from the file map to
9875         my $default_table;               # For non-string properties, a
9876                                          # reference to the match table that
9877                                          # will contain the list of code
9878                                          # points that map to $default_map.
9879
9880         # Get the next real non-comment line
9881         LINE:
9882         while ($file->next_line) {
9883
9884             # Default replacement type; means that if parts of the range have
9885             # already been stored in our tables, the new map overrides them if
9886             # they differ more than cosmetically
9887             my $replace = $IF_NOT_EQUIVALENT;
9888             my $map_type;            # Default type for the map of this range
9889
9890             #local $to_trace = 1 if main::DEBUG;
9891             trace $_ if main::DEBUG && $to_trace;
9892
9893             # Split the line into components
9894             my ($range, $property_name, $map, @remainder)
9895                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9896
9897             # If more or less on the line than we are expecting, warn and skip
9898             # the line
9899             if (@remainder) {
9900                 $file->carp_bad_line('Extra fields');
9901                 next LINE;
9902             }
9903             elsif ( ! defined $property_name) {
9904                 $file->carp_bad_line('Missing property');
9905                 next LINE;
9906             }
9907
9908             # Examine the range.
9909             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9910             {
9911                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9912                 next LINE;
9913             }
9914             my $low = hex $1;
9915             my $high = (defined $2) ? hex $2 : $low;
9916
9917             # For the very specialized case of comparing two Unicode
9918             # versions...
9919             if (DEBUG && $compare_versions) {
9920                 if ($property_name eq 'Age') {
9921
9922                     # Only allow code points at least as old as the version
9923                     # specified.
9924                     my $age = pack "C*", split(/\./, $map);        # v string
9925                     next LINE if $age gt $compare_versions;
9926                 }
9927                 else {
9928
9929                     # Again, we throw out code points younger than those of
9930                     # the specified version.  By now, the Age property is
9931                     # populated.  We use the intersection of each input range
9932                     # with this property to find what code points in it are
9933                     # valid.   To do the intersection, we have to convert the
9934                     # Age property map to a Range_list.  We only have to do
9935                     # this once.
9936                     if (! defined $compare_versions_range_list) {
9937                         my $age = property_ref('Age');
9938                         if (! -e 'DAge.txt') {
9939                             croak "Need to have 'DAge.txt' file to do version comparison";
9940                         }
9941                         elsif ($age->count == 0) {
9942                             croak "The 'Age' table is empty, but its file exists";
9943                         }
9944                         $compare_versions_range_list
9945                                         = Range_List->new(Initialize => $age);
9946                     }
9947
9948                     # An undefined map is always 'Y'
9949                     $map = 'Y' if ! defined $map;
9950
9951                     # Calculate the intersection of the input range with the
9952                     # code points that are known in the specified version
9953                     my @ranges = ($compare_versions_range_list
9954                                   & Range->new($low, $high))->ranges;
9955
9956                     # If the intersection is empty, throw away this range
9957                     next LINE unless @ranges;
9958
9959                     # Only examine the first range this time through the loop.
9960                     my $this_range = shift @ranges;
9961
9962                     # Put any remaining ranges in the queue to be processed
9963                     # later.  Note that there is unnecessary work here, as we
9964                     # will do the intersection again for each of these ranges
9965                     # during some future iteration of the LINE loop, but this
9966                     # code is not used in production.  The later intersections
9967                     # are guaranteed to not splinter, so this will not become
9968                     # an infinite loop.
9969                     my $line = join ';', $property_name, $map;
9970                     foreach my $range (@ranges) {
9971                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9972                                                             $range->start,
9973                                                             $range->end,
9974                                                             $line));
9975                     }
9976
9977                     # And process the first range, like any other.
9978                     $low = $this_range->start;
9979                     $high = $this_range->end;
9980                 }
9981             } # End of $compare_versions
9982
9983             # If changing to a new property, get the things constant per
9984             # property
9985             if ($previous_property_name ne $property_name) {
9986
9987                 $property_object = property_ref($property_name);
9988                 if (! defined $property_object) {
9989                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
9990                     next LINE;
9991                 }
9992                 { no overloading; $property_addr = pack 'J', $property_object; }
9993
9994                 # Defer changing names until have a line that is acceptable
9995                 # (the 'next' statement above means is unacceptable)
9996                 $previous_property_name = $property_name;
9997
9998                 # If not the first time for this property, retrieve info about
9999                 # it from the cache
10000                 if (defined ($property_info{$property_addr}{$TYPE})) {
10001                     $property_type = $property_info{$property_addr}{$TYPE};
10002                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10003                     $map_type
10004                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10005                     $default_table
10006                             = $property_info{$property_addr}{$DEFAULT_TABLE};
10007                 }
10008                 else {
10009
10010                     # Here, is the first time for this property.  Set up the
10011                     # cache.
10012                     $property_type = $property_info{$property_addr}{$TYPE}
10013                                    = $property_object->type;
10014                     $map_type
10015                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10016                         = $property_object->pseudo_map_type;
10017
10018                     # The Unicode files are set up so that if the map is not
10019                     # defined, it is a binary property
10020                     if (! defined $map && $property_type != $BINARY) {
10021                         if ($property_type != $UNKNOWN
10022                             && $property_type != $NON_STRING)
10023                         {
10024                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
10025                         }
10026                         else {
10027                             $property_object->set_type($BINARY);
10028                             $property_type
10029                                 = $property_info{$property_addr}{$TYPE}
10030                                 = $BINARY;
10031                         }
10032                     }
10033
10034                     # Get any @missings default for this property.  This
10035                     # should precede the first entry for the property in the
10036                     # input file, and is located in a comment that has been
10037                     # stored by the Input_file class until we access it here.
10038                     # It's possible that there is more than one such line
10039                     # waiting for us; collect them all, and parse
10040                     my @missings_list = $file->get_missings
10041                                             if $file->has_missings_defaults;
10042                     foreach my $default_ref (@missings_list) {
10043                         my $default = $default_ref->[0];
10044                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10045
10046                         # For string properties, the default is just what the
10047                         # file says, but non-string properties should already
10048                         # have set up a table for the default property value;
10049                         # use the table for these, so can resolve synonyms
10050                         # later to a single standard one.
10051                         if ($property_type == $STRING
10052                             || $property_type == $UNKNOWN)
10053                         {
10054                             $property_info{$addr}{$MISSINGS} = $default;
10055                         }
10056                         else {
10057                             $property_info{$addr}{$MISSINGS}
10058                                         = $property_object->table($default);
10059                         }
10060                     }
10061
10062                     # Finished storing all the @missings defaults in the input
10063                     # file so far.  Get the one for the current property.
10064                     my $missings = $property_info{$property_addr}{$MISSINGS};
10065
10066                     # But we likely have separately stored what the default
10067                     # should be.  (This is to accommodate versions of the
10068                     # standard where the @missings lines are absent or
10069                     # incomplete.)  Hopefully the two will match.  But check
10070                     # it out.
10071                     $default_map = $property_object->default_map;
10072
10073                     # If the map is a ref, it means that the default won't be
10074                     # processed until later, so undef it, so next few lines
10075                     # will redefine it to something that nothing will match
10076                     undef $default_map if ref $default_map;
10077
10078                     # Create a $default_map if don't have one; maybe a dummy
10079                     # that won't match anything.
10080                     if (! defined $default_map) {
10081
10082                         # Use any @missings line in the file.
10083                         if (defined $missings) {
10084                             if (ref $missings) {
10085                                 $default_map = $missings->full_name;
10086                                 $default_table = $missings;
10087                             }
10088                             else {
10089                                 $default_map = $missings;
10090                             }
10091
10092                             # And store it with the property for outside use.
10093                             $property_object->set_default_map($default_map);
10094                         }
10095                         else {
10096
10097                             # Neither an @missings nor a default map.  Create
10098                             # a dummy one, so won't have to test definedness
10099                             # in the main loop.
10100                             $default_map = '_Perl This will never be in a file
10101                                             from Unicode';
10102                         }
10103                     }
10104
10105                     # Here, we have $default_map defined, possibly in terms of
10106                     # $missings, but maybe not, and possibly is a dummy one.
10107                     if (defined $missings) {
10108
10109                         # Make sure there is no conflict between the two.
10110                         # $missings has priority.
10111                         if (ref $missings) {
10112                             $default_table
10113                                         = $property_object->table($default_map);
10114                             if (! defined $default_table
10115                                 || $default_table != $missings)
10116                             {
10117                                 if (! defined $default_table) {
10118                                     $default_table = $UNDEF;
10119                                 }
10120                                 $file->carp_bad_line(<<END
10121 The \@missings line for $property_name in $file says that missings default to
10122 $missings, but we expect it to be $default_table.  $missings used.
10123 END
10124                                 );
10125                                 $default_table = $missings;
10126                                 $default_map = $missings->full_name;
10127                             }
10128                             $property_info{$property_addr}{$DEFAULT_TABLE}
10129                                                         = $default_table;
10130                         }
10131                         elsif ($default_map ne $missings) {
10132                             $file->carp_bad_line(<<END
10133 The \@missings line for $property_name in $file says that missings default to
10134 $missings, but we expect it to be $default_map.  $missings used.
10135 END
10136                             );
10137                             $default_map = $missings;
10138                         }
10139                     }
10140
10141                     $property_info{$property_addr}{$DEFAULT_MAP}
10142                                                     = $default_map;
10143
10144                     # If haven't done so already, find the table corresponding
10145                     # to this map for non-string properties.
10146                     if (! defined $default_table
10147                         && $property_type != $STRING
10148                         && $property_type != $UNKNOWN)
10149                     {
10150                         $default_table = $property_info{$property_addr}
10151                                                         {$DEFAULT_TABLE}
10152                                     = $property_object->table($default_map);
10153                     }
10154                 } # End of is first time for this property
10155             } # End of switching properties.
10156
10157             # Ready to process the line.
10158             # The Unicode files are set up so that if the map is not defined,
10159             # it is a binary property with value 'Y'
10160             if (! defined $map) {
10161                 $map = 'Y';
10162             }
10163             else {
10164
10165                 # If the map begins with a special command to us (enclosed in
10166                 # delimiters), extract the command(s).
10167                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10168                     my $command = $1;
10169                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
10170                         $replace = $1;
10171                     }
10172                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
10173                         $map_type = $1;
10174                     }
10175                     else {
10176                         $file->carp_bad_line("Unknown command line: '$1'");
10177                         next LINE;
10178                     }
10179                 }
10180             }
10181
10182             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10183             {
10184
10185                 # Here, we have a map to a particular code point, and the
10186                 # default map is to a code point itself.  If the range
10187                 # includes the particular code point, change that portion of
10188                 # the range to the default.  This makes sure that in the final
10189                 # table only the non-defaults are listed.
10190                 my $decimal_map = hex $map;
10191                 if ($low <= $decimal_map && $decimal_map <= $high) {
10192
10193                     # If the range includes stuff before or after the map
10194                     # we're changing, split it and process the split-off parts
10195                     # later.
10196                     if ($low < $decimal_map) {
10197                         $file->insert_adjusted_lines(
10198                                             sprintf("%04X..%04X; %s; %s",
10199                                                     $low,
10200                                                     $decimal_map - 1,
10201                                                     $property_name,
10202                                                     $map));
10203                     }
10204                     if ($high > $decimal_map) {
10205                         $file->insert_adjusted_lines(
10206                                             sprintf("%04X..%04X; %s; %s",
10207                                                     $decimal_map + 1,
10208                                                     $high,
10209                                                     $property_name,
10210                                                     $map));
10211                     }
10212                     $low = $high = $decimal_map;
10213                     $map = $CODE_POINT;
10214                 }
10215             }
10216
10217             # If we can tell that this is a synonym for the default map, use
10218             # the default one instead.
10219             if ($property_type != $STRING
10220                 && $property_type != $UNKNOWN)
10221             {
10222                 my $table = $property_object->table($map);
10223                 if (defined $table && $table == $default_table) {
10224                     $map = $default_map;
10225                 }
10226             }
10227
10228             # And figure out the map type if not known.
10229             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10230                 if ($map eq "") {   # Nulls are always $NULL map type
10231                     $map_type = $NULL;
10232                 } # Otherwise, non-strings, and those that don't allow
10233                   # $MULTI_CP, and those that aren't multiple code points are
10234                   # 0
10235                 elsif
10236                    (($property_type != $STRING && $property_type != $UNKNOWN)
10237                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10238                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
10239                 {
10240                     $map_type = 0;
10241                 }
10242                 else {
10243                     $map_type = $MULTI_CP;
10244                 }
10245             }
10246
10247             $property_object->add_map($low, $high,
10248                                         $map,
10249                                         Type => $map_type,
10250                                         Replace => $replace);
10251         } # End of loop through file's lines
10252
10253         return;
10254     }
10255 }
10256
10257 { # Closure for UnicodeData.txt handling
10258
10259     # This file was the first one in the UCD; its design leads to some
10260     # awkwardness in processing.  Here is a sample line:
10261     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10262     # The fields in order are:
10263     my $i = 0;            # The code point is in field 0, and is shifted off.
10264     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
10265     my $CATEGORY = $i++;  # category (e.g. "Lu")
10266     my $CCC = $i++;       # Canonical combining class (e.g. "230")
10267     my $BIDI = $i++;      # directional class (e.g. "L")
10268     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
10269     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
10270     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10271                                          # Dual-use in this program; see below
10272     my $NUMERIC = $i++;   # numeric value
10273     my $MIRRORED = $i++;  # ? mirrored
10274     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10275     my $COMMENT = $i++;   # iso comment
10276     my $UPPER = $i++;     # simple uppercase mapping
10277     my $LOWER = $i++;     # simple lowercase mapping
10278     my $TITLE = $i++;     # simple titlecase mapping
10279     my $input_field_count = $i;
10280
10281     # This routine in addition outputs these extra fields:
10282
10283     my $DECOMP_TYPE = $i++; # Decomposition type
10284
10285     # These fields are modifications of ones above, and are usually
10286     # suppressed; they must come last, as for speed, the loop upper bound is
10287     # normally set to ignore them
10288     my $NAME = $i++;        # This is the strict name field, not the one that
10289                             # charnames uses.
10290     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
10291                             # by Unicode::Normalize
10292     my $last_field = $i - 1;
10293
10294     # All these are read into an array for each line, with the indices defined
10295     # above.  The empty fields in the example line above indicate that the
10296     # value is defaulted.  The handler called for each line of the input
10297     # changes these to their defaults.
10298
10299     # Here are the official names of the properties, in a parallel array:
10300     my @field_names;
10301     $field_names[$BIDI] = 'Bidi_Class';
10302     $field_names[$CATEGORY] = 'General_Category';
10303     $field_names[$CCC] = 'Canonical_Combining_Class';
10304     $field_names[$CHARNAME] = 'Perl_Charnames';
10305     $field_names[$COMMENT] = 'ISO_Comment';
10306     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10307     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10308     $field_names[$LOWER] = 'Lowercase_Mapping';
10309     $field_names[$MIRRORED] = 'Bidi_Mirrored';
10310     $field_names[$NAME] = 'Name';
10311     $field_names[$NUMERIC] = 'Numeric_Value';
10312     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10313     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10314     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10315     $field_names[$TITLE] = 'Titlecase_Mapping';
10316     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10317     $field_names[$UPPER] = 'Uppercase_Mapping';
10318
10319     # Some of these need a little more explanation:
10320     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10321     #   property, but is used in calculating the Numeric_Type.  Perl however,
10322     #   creates a file from this field, so a Perl property is created from it.
10323     # Similarly, the Other_Digit field is used only for calculating the
10324     #   Numeric_Type, and so it can be safely re-used as the place to store
10325     #   the value for Numeric_Type; hence it is referred to as
10326     #   $NUMERIC_TYPE_OTHER_DIGIT.
10327     # The input field named $PERL_DECOMPOSITION is a combination of both the
10328     #   decomposition mapping and its type.  Perl creates a file containing
10329     #   exactly this field, so it is used for that.  The two properties are
10330     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10331     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
10332     #   output it), as Perl doesn't use it directly.
10333     # The input field named here $CHARNAME is used to construct the
10334     #   Perl_Charnames property, which is a combination of the Name property
10335     #   (which the input field contains), and the Unicode_1_Name property, and
10336     #   others from other files.  Since, the strict Name property is not used
10337     #   by Perl, this field is used for the table that Perl does use.  The
10338     #   strict Name property table is usually suppressed (unless the lists are
10339     #   changed to output it), so it is accumulated in a separate field,
10340     #   $NAME, which to save time is discarded unless the table is actually to
10341     #   be output
10342
10343     # This file is processed like most in this program.  Control is passed to
10344     # process_generic_property_file() which calls filter_UnicodeData_line()
10345     # for each input line.  This filter converts the input into line(s) that
10346     # process_generic_property_file() understands.  There is also a setup
10347     # routine called before any of the file is processed, and a handler for
10348     # EOF processing, all in this closure.
10349
10350     # A huge speed-up occurred at the cost of some added complexity when these
10351     # routines were altered to buffer the outputs into ranges.  Almost all the
10352     # lines of the input file apply to just one code point, and for most
10353     # properties, the map for the next code point up is the same as the
10354     # current one.  So instead of creating a line for each property for each
10355     # input line, filter_UnicodeData_line() remembers what the previous map
10356     # of a property was, and doesn't generate a line to pass on until it has
10357     # to, as when the map changes; and that passed-on line encompasses the
10358     # whole contiguous range of code points that have the same map for that
10359     # property.  This means a slight amount of extra setup, and having to
10360     # flush these buffers on EOF, testing if the maps have changed, plus
10361     # remembering state information in the closure.  But it means a lot less
10362     # real time in not having to change the data base for each property on
10363     # each line.
10364
10365     # Another complication is that there are already a few ranges designated
10366     # in the input.  There are two lines for each, with the same maps except
10367     # the code point and name on each line.  This was actually the hardest
10368     # thing to design around.  The code points in those ranges may actually
10369     # have real maps not given by these two lines.  These maps will either
10370     # be algorithmically determinable, or be in the extracted files furnished
10371     # with the UCD.  In the event of conflicts between these extracted files,
10372     # and this one, Unicode says that this one prevails.  But it shouldn't
10373     # prevail for conflicts that occur in these ranges.  The data from the
10374     # extracted files prevails in those cases.  So, this program is structured
10375     # so that those files are processed first, storing maps.  Then the other
10376     # files are processed, generally overwriting what the extracted files
10377     # stored.  But just the range lines in this input file are processed
10378     # without overwriting.  This is accomplished by adding a special string to
10379     # the lines output to tell process_generic_property_file() to turn off the
10380     # overwriting for just this one line.
10381     # A similar mechanism is used to tell it that the map is of a non-default
10382     # type.
10383
10384     sub setup_UnicodeData { # Called before any lines of the input are read
10385         my $file = shift;
10386         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10387
10388         # Create a new property specially located that is a combination of the
10389         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10390         # Name_Alias properties.  (The final duplicates elements of the
10391         # first.)  A comment for it will later be constructed based on the
10392         # actual properties present and used
10393         $perl_charname = Property->new('Perl_Charnames',
10394                        Default_Map => "",
10395                        Directory => File::Spec->curdir(),
10396                        File => 'Name',
10397                        Fate => $INTERNAL_ONLY,
10398                        Perl_Extension => 1,
10399                        Range_Size_1 => \&output_perl_charnames_line,
10400                        Type => $STRING,
10401                        );
10402         $perl_charname->set_proxy_for('Name');
10403
10404         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10405                                         Directory => File::Spec->curdir(),
10406                                         File => 'Decomposition',
10407                                         Format => $DECOMP_STRING_FORMAT,
10408                                         Fate => $INTERNAL_ONLY,
10409                                         Perl_Extension => 1,
10410                                         Default_Map => $CODE_POINT,
10411
10412                                         # normalize.pm can't cope with these
10413                                         Output_Range_Counts => 0,
10414
10415                                         # This is a specially formatted table
10416                                         # explicitly for normalize.pm, which
10417                                         # is expecting a particular format,
10418                                         # which means that mappings containing
10419                                         # multiple code points are in the main
10420                                         # body of the table
10421                                         Map_Type => $COMPUTE_NO_MULTI_CP,
10422                                         Type => $STRING,
10423                                         To_Output_Map => $INTERNAL_MAP,
10424                                         );
10425         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10426         $Perl_decomp->add_comment(join_lines(<<END
10427 This mapping is a combination of the Unicode 'Decomposition_Type' and
10428 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
10429 identical to the official Unicode 'Decomposition_Mapping' property except for
10430 two things:
10431  1) It omits the algorithmically determinable Hangul syllable decompositions,
10432 which normalize.pm handles algorithmically.
10433  2) It contains the decomposition type as well.  Non-canonical decompositions
10434 begin with a word in angle brackets, like <super>, which denotes the
10435 compatible decomposition type.  If the map does not begin with the <angle
10436 brackets>, the decomposition is canonical.
10437 END
10438         ));
10439
10440         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10441                                         Default_Map => "",
10442                                         Perl_Extension => 1,
10443                                         Directory => $map_directory,
10444                                         Type => $STRING,
10445                                         To_Output_Map => $OUTPUT_ADJUSTED,
10446                                         );
10447         $Decimal_Digit->add_comment(join_lines(<<END
10448 This file gives the mapping of all code points which represent a single
10449 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10450 points, and the mapping of each non-initial element of each range is actually
10451 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10452 These code points are those that have Numeric_Type=Decimal; not special
10453 things, like subscripts nor Roman numerals.
10454 END
10455         ));
10456
10457         # These properties are not used for generating anything else, and are
10458         # usually not output.  By making them last in the list, we can just
10459         # change the high end of the loop downwards to avoid the work of
10460         # generating a table(s) that is/are just going to get thrown away.
10461         if (! property_ref('Decomposition_Mapping')->to_output_map
10462             && ! property_ref('Name')->to_output_map)
10463         {
10464             $last_field = min($NAME, $DECOMP_MAP) - 1;
10465         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10466             $last_field = $DECOMP_MAP;
10467         } elsif (property_ref('Name')->to_output_map) {
10468             $last_field = $NAME;
10469         }
10470         return;
10471     }
10472
10473     my $first_time = 1;                 # ? Is this the first line of the file
10474     my $in_range = 0;                   # ? Are we in one of the file's ranges
10475     my $previous_cp;                    # hex code point of previous line
10476     my $decimal_previous_cp = -1;       # And its decimal equivalent
10477     my @start;                          # For each field, the current starting
10478                                         # code point in hex for the range
10479                                         # being accumulated.
10480     my @fields;                         # The input fields;
10481     my @previous_fields;                # And those from the previous call
10482
10483     sub filter_UnicodeData_line {
10484         # Handle a single input line from UnicodeData.txt; see comments above
10485         # Conceptually this takes a single line from the file containing N
10486         # properties, and converts it into N lines with one property per line,
10487         # which is what the final handler expects.  But there are
10488         # complications due to the quirkiness of the input file, and to save
10489         # time, it accumulates ranges where the property values don't change
10490         # and only emits lines when necessary.  This is about an order of
10491         # magnitude fewer lines emitted.
10492
10493         my $file = shift;
10494         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10495
10496         # $_ contains the input line.
10497         # -1 in split means retain trailing null fields
10498         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10499
10500         #local $to_trace = 1 if main::DEBUG;
10501         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10502         if (@fields > $input_field_count) {
10503             $file->carp_bad_line('Extra fields');
10504             $_ = "";
10505             return;
10506         }
10507
10508         my $decimal_cp = hex $cp;
10509
10510         # We have to output all the buffered ranges when the next code point
10511         # is not exactly one after the previous one, which means there is a
10512         # gap in the ranges.
10513         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10514
10515         # The decomposition mapping field requires special handling.  It looks
10516         # like either:
10517         #
10518         # <compat> 0032 0020
10519         # 0041 0300
10520         #
10521         # The decomposition type is enclosed in <brackets>; if missing, it
10522         # means the type is canonical.  There are two decomposition mapping
10523         # tables: the one for use by Perl's normalize.pm has a special format
10524         # which is this field intact; the other, for general use is of
10525         # standard format.  In either case we have to find the decomposition
10526         # type.  Empty fields have None as their type, and map to the code
10527         # point itself
10528         if ($fields[$PERL_DECOMPOSITION] eq "") {
10529             $fields[$DECOMP_TYPE] = 'None';
10530             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10531         }
10532         else {
10533             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10534                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
10535             if (! defined $fields[$DECOMP_TYPE]) {
10536                 $fields[$DECOMP_TYPE] = 'Canonical';
10537                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10538             }
10539             else {
10540                 $fields[$DECOMP_MAP] = $map;
10541             }
10542         }
10543
10544         # The 3 numeric fields also require special handling.  The 2 digit
10545         # fields must be either empty or match the number field.  This means
10546         # that if it is empty, they must be as well, and the numeric type is
10547         # None, and the numeric value is 'Nan'.
10548         # The decimal digit field must be empty or match the other digit
10549         # field.  If the decimal digit field is non-empty, the code point is
10550         # a decimal digit, and the other two fields will have the same value.
10551         # If it is empty, but the other digit field is non-empty, the code
10552         # point is an 'other digit', and the number field will have the same
10553         # value as the other digit field.  If the other digit field is empty,
10554         # but the number field is non-empty, the code point is a generic
10555         # numeric type.
10556         if ($fields[$NUMERIC] eq "") {
10557             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10558                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10559             ) {
10560                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
10561             }
10562             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10563             $fields[$NUMERIC] = 'NaN';
10564         }
10565         else {
10566             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10567             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10568                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10569                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
10570                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10571             }
10572             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10573                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10574                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10575             }
10576             else {
10577                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10578
10579                 # Rationals require extra effort.
10580                 register_fraction($fields[$NUMERIC])
10581                                                 if $fields[$NUMERIC] =~ qr{/};
10582             }
10583         }
10584
10585         # For the properties that have empty fields in the file, and which
10586         # mean something different from empty, change them to that default.
10587         # Certain fields just haven't been empty so far in any Unicode
10588         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10589         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
10590         # the defaults; which are very unlikely to ever change.
10591         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10592         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10593
10594         # UAX44 says that if title is empty, it is the same as whatever upper
10595         # is,
10596         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10597
10598         # There are a few pairs of lines like:
10599         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10600         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10601         # that define ranges.  These should be processed after the fields are
10602         # adjusted above, as they may override some of them; but mostly what
10603         # is left is to possibly adjust the $CHARNAME field.  The names of all the
10604         # paired lines start with a '<', but this is also true of '<control>,
10605         # which isn't one of these special ones.
10606         if ($fields[$CHARNAME] eq '<control>') {
10607
10608             # Some code points in this file have the pseudo-name
10609             # '<control>', but the official name for such ones is the null
10610             # string.
10611             $fields[$NAME] = $fields[$CHARNAME] = "";
10612
10613             # We had better not be in between range lines.
10614             if ($in_range) {
10615                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10616                 $in_range = 0;
10617             }
10618         }
10619         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10620
10621             # Here is a non-range line.  We had better not be in between range
10622             # lines.
10623             if ($in_range) {
10624                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10625                 $in_range = 0;
10626             }
10627             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10628
10629                 # These are code points whose names end in their code points,
10630                 # which means the names are algorithmically derivable from the
10631                 # code points.  To shorten the output Name file, the algorithm
10632                 # for deriving these is placed in the file instead of each
10633                 # code point, so they have map type $CP_IN_NAME
10634                 $fields[$CHARNAME] = $CMD_DELIM
10635                                  . $MAP_TYPE_CMD
10636                                  . '='
10637                                  . $CP_IN_NAME
10638                                  . $CMD_DELIM
10639                                  . $fields[$CHARNAME];
10640             }
10641             $fields[$NAME] = $fields[$CHARNAME];
10642         }
10643         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10644             $fields[$CHARNAME] = $fields[$NAME] = $1;
10645
10646             # Here we are at the beginning of a range pair.
10647             if ($in_range) {
10648                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
10649             }
10650             $in_range = 1;
10651
10652             # Because the properties in the range do not overwrite any already
10653             # in the db, we must flush the buffers of what's already there, so
10654             # they get handled in the normal scheme.
10655             $force_output = 1;
10656
10657         }
10658         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10659             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
10660             $_ = "";
10661             return;
10662         }
10663         else { # Here, we are at the last line of a range pair.
10664
10665             if (! $in_range) {
10666                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
10667                 $_ = "";
10668                 return;
10669             }
10670             $in_range = 0;
10671
10672             $fields[$NAME] = $fields[$CHARNAME];
10673
10674             # Check that the input is valid: that the closing of the range is
10675             # the same as the beginning.
10676             foreach my $i (0 .. $last_field) {
10677                 next if $fields[$i] eq $previous_fields[$i];
10678                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
10679             }
10680
10681             # The processing differs depending on the type of range,
10682             # determined by its $CHARNAME
10683             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10684
10685                 # Check that the data looks right.
10686                 if ($decimal_previous_cp != $SBase) {
10687                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
10688                 }
10689                 if ($decimal_cp != $SBase + $SCount - 1) {
10690                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
10691                 }
10692
10693                 # The Hangul syllable range has a somewhat complicated name
10694                 # generation algorithm.  Each code point in it has a canonical
10695                 # decomposition also computable by an algorithm.  The
10696                 # perl decomposition map table built from these is used only
10697                 # by normalize.pm, which has the algorithm built in it, so the
10698                 # decomposition maps are not needed, and are large, so are
10699                 # omitted from it.  If the full decomposition map table is to
10700                 # be output, the decompositions are generated for it, in the
10701                 # EOF handling code for this input file.
10702
10703                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10704
10705                 # This range is stored in our internal structure with its
10706                 # own map type, different from all others.
10707                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10708                                         = $CMD_DELIM
10709                                           . $MAP_TYPE_CMD
10710                                           . '='
10711                                           . $HANGUL_SYLLABLE
10712                                           . $CMD_DELIM
10713                                           . $fields[$CHARNAME];
10714             }
10715             elsif ($fields[$CHARNAME] =~ /^CJK/) {
10716
10717                 # The name for these contains the code point itself, and all
10718                 # are defined to have the same base name, regardless of what
10719                 # is in the file.  They are stored in our internal structure
10720                 # with a map type of $CP_IN_NAME
10721                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10722                                         = $CMD_DELIM
10723                                            . $MAP_TYPE_CMD
10724                                            . '='
10725                                            . $CP_IN_NAME
10726                                            . $CMD_DELIM
10727                                            . 'CJK UNIFIED IDEOGRAPH';
10728
10729             }
10730             elsif ($fields[$CATEGORY] eq 'Co'
10731                      || $fields[$CATEGORY] eq 'Cs')
10732             {
10733                 # The names of all the code points in these ranges are set to
10734                 # null, as there are no names for the private use and
10735                 # surrogate code points.
10736
10737                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10738             }
10739             else {
10740                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
10741             }
10742
10743             # The first line of the range caused everything else to be output,
10744             # and then its values were stored as the beginning values for the
10745             # next set of ranges, which this one ends.  Now, for each value,
10746             # add a command to tell the handler that these values should not
10747             # replace any existing ones in our database.
10748             foreach my $i (0 .. $last_field) {
10749                 $previous_fields[$i] = $CMD_DELIM
10750                                         . $REPLACE_CMD
10751                                         . '='
10752                                         . $NO
10753                                         . $CMD_DELIM
10754                                         . $previous_fields[$i];
10755             }
10756
10757             # And change things so it looks like the entire range has been
10758             # gone through with this being the final part of it.  Adding the
10759             # command above to each field will cause this range to be flushed
10760             # during the next iteration, as it guaranteed that the stored
10761             # field won't match whatever value the next one has.
10762             $previous_cp = $cp;
10763             $decimal_previous_cp = $decimal_cp;
10764
10765             # We are now set up for the next iteration; so skip the remaining
10766             # code in this subroutine that does the same thing, but doesn't
10767             # know about these ranges.
10768             $_ = "";
10769
10770             return;
10771         }
10772
10773         # On the very first line, we fake it so the code below thinks there is
10774         # nothing to output, and initialize so that when it does get output it
10775         # uses the first line's values for the lowest part of the range.
10776         # (One could avoid this by using peek(), but then one would need to
10777         # know the adjustments done above and do the same ones in the setup
10778         # routine; not worth it)
10779         if ($first_time) {
10780             $first_time = 0;
10781             @previous_fields = @fields;
10782             @start = ($cp) x scalar @fields;
10783             $decimal_previous_cp = $decimal_cp - 1;
10784         }
10785
10786         # For each field, output the stored up ranges that this code point
10787         # doesn't fit in.  Earlier we figured out if all ranges should be
10788         # terminated because of changing the replace or map type styles, or if
10789         # there is a gap between this new code point and the previous one, and
10790         # that is stored in $force_output.  But even if those aren't true, we
10791         # need to output the range if this new code point's value for the
10792         # given property doesn't match the stored range's.
10793         #local $to_trace = 1 if main::DEBUG;
10794         foreach my $i (0 .. $last_field) {
10795             my $field = $fields[$i];
10796             if ($force_output || $field ne $previous_fields[$i]) {
10797
10798                 # Flush the buffer of stored values.
10799                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10800
10801                 # Start a new range with this code point and its value
10802                 $start[$i] = $cp;
10803                 $previous_fields[$i] = $field;
10804             }
10805         }
10806
10807         # Set the values for the next time.
10808         $previous_cp = $cp;
10809         $decimal_previous_cp = $decimal_cp;
10810
10811         # The input line has generated whatever adjusted lines are needed, and
10812         # should not be looked at further.
10813         $_ = "";
10814         return;
10815     }
10816
10817     sub EOF_UnicodeData {
10818         # Called upon EOF to flush the buffers, and create the Hangul
10819         # decomposition mappings if needed.
10820
10821         my $file = shift;
10822         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10823
10824         # Flush the buffers.
10825         foreach my $i (0 .. $last_field) {
10826             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10827         }
10828
10829         if (-e 'Jamo.txt') {
10830
10831             # The algorithm is published by Unicode, based on values in
10832             # Jamo.txt, (which should have been processed before this
10833             # subroutine), and the results left in %Jamo
10834             unless (%Jamo) {
10835                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
10836                 return;
10837             }
10838
10839             # If the full decomposition map table is being output, insert
10840             # into it the Hangul syllable mappings.  This is to avoid having
10841             # to publish a subroutine in it to compute them.  (which would
10842             # essentially be this code.)  This uses the algorithm published by
10843             # Unicode.  (No hangul syllables in version 1)
10844             if ($v_version ge v2.0.0
10845                 && property_ref('Decomposition_Mapping')->to_output_map) {
10846                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10847                     use integer;
10848                     my $SIndex = $S - $SBase;
10849                     my $L = $LBase + $SIndex / $NCount;
10850                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
10851                     my $T = $TBase + $SIndex % $TCount;
10852
10853                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10854                     my $decomposition = sprintf("%04X %04X", $L, $V);
10855                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10856                     $file->insert_adjusted_lines(
10857                                 sprintf("%04X; Decomposition_Mapping; %s",
10858                                         $S,
10859                                         $decomposition));
10860                 }
10861             }
10862         }
10863
10864         return;
10865     }
10866
10867     sub filter_v1_ucd {
10868         # Fix UCD lines in version 1.  This is probably overkill, but this
10869         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
10870         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
10871         #       removed.  This program retains them
10872         # 2)    didn't include ranges, which it should have, and which are now
10873         #       added in @corrected_lines below.  It was hand populated by
10874         #       taking the data from Version 2, verified by analyzing
10875         #       DAge.txt.
10876         # 3)    There is a syntax error in the entry for U+09F8 which could
10877         #       cause problems for utf8_heavy, and so is changed.  It's
10878         #       numeric value was simply a minus sign, without any number.
10879         #       (Eventually Unicode changed the code point to non-numeric.)
10880         # 4)    The decomposition types often don't match later versions
10881         #       exactly, and the whole syntax of that field is different; so
10882         #       the syntax is changed as well as the types to their later
10883         #       terminology.  Otherwise normalize.pm would be very unhappy
10884         # 5)    Many ccc classes are different.  These are left intact.
10885         # 6)    U+FF10..U+FF19 are missing their numeric values in all three
10886         #       fields.  These are unchanged because it doesn't really cause
10887         #       problems for Perl.
10888         # 7)    A number of code points, such as controls, don't have their
10889         #       Unicode Version 1 Names in this file.  These are added.
10890         # 8)    A number of Symbols were marked as Lm.  This changes those in
10891         #       the Latin1 range, so that regexes work.
10892         # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
10893         #       referred to by their lc equivalents.  Not fixed.
10894
10895         my @corrected_lines = split /\n/, <<'END';
10896 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10897 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10898 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10899 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10900 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10901 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10902 END
10903
10904         my $file = shift;
10905         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10906
10907         #local $to_trace = 1 if main::DEBUG;
10908         trace $_ if main::DEBUG && $to_trace;
10909
10910         # -1 => retain trailing null fields
10911         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10912
10913         # At the first place that is wrong in the input, insert all the
10914         # corrections, replacing the wrong line.
10915         if ($code_point eq '4E00') {
10916             my @copy = @corrected_lines;
10917             $_ = shift @copy;
10918             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10919
10920             $file->insert_lines(@copy);
10921         }
10922         elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
10923
10924             # There are no Lm characters in Latin1; these should be 'Sk', but
10925             # there isn't that in V1.
10926             $fields[$CATEGORY] = 'So';
10927         }
10928
10929         if ($fields[$NUMERIC] eq '-') {
10930             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
10931         }
10932
10933         if  ($fields[$PERL_DECOMPOSITION] ne "") {
10934
10935             # Several entries have this change to superscript 2 or 3 in the
10936             # middle.  Convert these to the modern version, which is to use
10937             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10938             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10939             # 'HHHH HHHH 00B3 HHHH'.
10940             # It turns out that all of these that don't have another
10941             # decomposition defined at the beginning of the line have the
10942             # <square> decomposition in later releases.
10943             if ($code_point ne '00B2' && $code_point ne '00B3') {
10944                 if  ($fields[$PERL_DECOMPOSITION]
10945                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10946                 {
10947                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10948                         $fields[$PERL_DECOMPOSITION] = '<square> '
10949                         . $fields[$PERL_DECOMPOSITION];
10950                     }
10951                 }
10952             }
10953
10954             # If is like '<+circled> 0052 <-circled>', convert to
10955             # '<circled> 0052'
10956             $fields[$PERL_DECOMPOSITION] =~
10957                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
10958
10959             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10960             $fields[$PERL_DECOMPOSITION] =~
10961                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10962             or $fields[$PERL_DECOMPOSITION] =~
10963                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10964             or $fields[$PERL_DECOMPOSITION] =~
10965                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10966             or $fields[$PERL_DECOMPOSITION] =~
10967                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10968
10969             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10970             $fields[$PERL_DECOMPOSITION] =~
10971                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10972
10973             # Change names to modern form.
10974             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10975             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10976             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10977             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10978
10979             # One entry has weird braces
10980             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10981
10982             # One entry at U+2116 has an extra <sup>
10983             $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
10984         }
10985
10986         $_ = join ';', $code_point, @fields;
10987         trace $_ if main::DEBUG && $to_trace;
10988         return;
10989     }
10990
10991     sub filter_bad_Nd_ucd {
10992         # Early versions specified a value in the decimal digit field even
10993         # though the code point wasn't a decimal digit.  Clear the field in
10994         # that situation, so that the main code doesn't think it is a decimal
10995         # digit.
10996
10997         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10998         if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
10999             $fields[$PERL_DECIMAL_DIGIT] = "";
11000             $_ = join ';', $code_point, @fields;
11001         }
11002         return;
11003     }
11004
11005     my @U1_control_names = split /\n/, <<'END';
11006 NULL
11007 START OF HEADING
11008 START OF TEXT
11009 END OF TEXT
11010 END OF TRANSMISSION
11011 ENQUIRY
11012 ACKNOWLEDGE
11013 BELL
11014 BACKSPACE
11015 HORIZONTAL TABULATION
11016 LINE FEED
11017 VERTICAL TABULATION
11018 FORM FEED
11019 CARRIAGE RETURN
11020 SHIFT OUT
11021 SHIFT IN
11022 DATA LINK ESCAPE
11023 DEVICE CONTROL ONE
11024 DEVICE CONTROL TWO
11025 DEVICE CONTROL THREE
11026 DEVICE CONTROL FOUR
11027 NEGATIVE ACKNOWLEDGE
11028 SYNCHRONOUS IDLE
11029 END OF TRANSMISSION BLOCK
11030 CANCEL
11031 END OF MEDIUM
11032 SUBSTITUTE
11033 ESCAPE
11034 FILE SEPARATOR
11035 GROUP SEPARATOR
11036 RECORD SEPARATOR
11037 UNIT SEPARATOR
11038 DELETE
11039 BREAK PERMITTED HERE
11040 NO BREAK HERE
11041 INDEX
11042 NEXT LINE
11043 START OF SELECTED AREA
11044 END OF SELECTED AREA
11045 CHARACTER TABULATION SET
11046 CHARACTER TABULATION WITH JUSTIFICATION
11047 LINE TABULATION SET
11048 PARTIAL LINE DOWN
11049 PARTIAL LINE UP
11050 REVERSE LINE FEED
11051 SINGLE SHIFT TWO
11052 SINGLE SHIFT THREE
11053 DEVICE CONTROL STRING
11054 PRIVATE USE ONE
11055 PRIVATE USE TWO
11056 SET TRANSMIT STATE
11057 CANCEL CHARACTER
11058 MESSAGE WAITING
11059 START OF GUARDED AREA
11060 END OF GUARDED AREA
11061 START OF STRING
11062 SINGLE CHARACTER INTRODUCER
11063 CONTROL SEQUENCE INTRODUCER
11064 STRING TERMINATOR
11065 OPERATING SYSTEM COMMAND
11066 PRIVACY MESSAGE
11067 APPLICATION PROGRAM COMMAND
11068 END
11069
11070     sub filter_early_U1_names {
11071         # Very early versions did not have the Unicode_1_name field specified.
11072         # They differed in which ones were present; make sure a U1 name
11073         # exists, so that Unicode::UCD::charinfo will work
11074
11075         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11076
11077
11078         # @U1_control names above are entirely positional, so we pull them out
11079         # in the exact order required, with gaps for the ones that don't have
11080         # names.
11081         if ($code_point =~ /^00[01]/
11082             || $code_point eq '007F'
11083             || $code_point =~ /^008[2-9A-F]/
11084             || $code_point =~ /^009[0-8A-F]/)
11085         {
11086             my $u1_name = shift @U1_control_names;
11087             $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11088             $_ = join ';', $code_point, @fields;
11089         }
11090         return;
11091     }
11092
11093     sub filter_v2_1_5_ucd {
11094         # A dozen entries in this 2.1.5 file had the mirrored and numeric
11095         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
11096         # column appears to be N, swap it back.
11097
11098         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11099         if ($fields[$NUMERIC] eq 'N') {
11100             $fields[$NUMERIC] = $fields[$MIRRORED];
11101             $fields[$MIRRORED] = 'N';
11102             $_ = join ';', $code_point, @fields;
11103         }
11104         return;
11105     }
11106
11107     sub filter_v6_ucd {
11108
11109         # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11110         # it wasn't accepted, to allow for some deprecation cycles.  This
11111         # function is not called after 5.16
11112
11113         return if $_ !~ /^(?:0007|1F514|070F);/;
11114
11115         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11116         if ($code_point eq '0007') {
11117             $fields[$CHARNAME] = "";
11118         }
11119         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11120                             # http://www.unicode.org/versions/corrigendum8.html
11121             $fields[$BIDI] = "AL";
11122         }
11123         elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11124             $fields[$CHARNAME] = "";
11125         }
11126
11127         $_ = join ';', $code_point, @fields;
11128
11129         return;
11130     }
11131 } # End closure for UnicodeData
11132
11133 sub process_GCB_test {
11134
11135     my $file = shift;
11136     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11137
11138     while ($file->next_line) {
11139         push @backslash_X_tests, $_;
11140     }
11141
11142     return;
11143 }
11144
11145 sub process_NamedSequences {
11146     # NamedSequences.txt entries are just added to an array.  Because these
11147     # don't look like the other tables, they have their own handler.
11148     # An example:
11149     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11150     #
11151     # This just adds the sequence to an array for later handling
11152
11153     my $file = shift;
11154     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11155
11156     while ($file->next_line) {
11157         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11158         if (@remainder) {
11159             $file->carp_bad_line(
11160                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11161             next;
11162         }
11163
11164         # Note single \t in keeping with special output format of
11165         # Perl_charnames.  But it turns out that the code points don't have to
11166         # be 5 digits long, like the rest, based on the internal workings of
11167         # charnames.pm.  This could be easily changed for consistency.
11168         push @named_sequences, "$sequence\t$name";
11169     }
11170     return;
11171 }
11172
11173 { # Closure
11174
11175     my $first_range;
11176
11177     sub  filter_early_ea_lb {
11178         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
11179         # third field be the name of the code point, which can be ignored in
11180         # most cases.  But it can be meaningful if it marks a range:
11181         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11182         # 3400;W;<CJK Ideograph Extension A, First>
11183         #
11184         # We need to see the First in the example above to know it's a range.
11185         # They did not use the later range syntaxes.  This routine changes it
11186         # to use the modern syntax.
11187         # $1 is the Input_file object.
11188
11189         my @fields = split /\s*;\s*/;
11190         if ($fields[2] =~ /^<.*, First>/) {
11191             $first_range = $fields[0];
11192             $_ = "";
11193         }
11194         elsif ($fields[2] =~ /^<.*, Last>/) {
11195             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11196         }
11197         else {
11198             undef $first_range;
11199             $_ = "$fields[0]; $fields[1]";
11200         }
11201
11202         return;
11203     }
11204 }
11205
11206 sub filter_old_style_arabic_shaping {
11207     # Early versions used a different term for the later one.
11208
11209     my @fields = split /\s*;\s*/;
11210     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11211     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
11212     $_ = join ';', @fields;
11213     return;
11214 }
11215
11216 sub filter_arabic_shaping_line {
11217     # ArabicShaping.txt has entries that look like:
11218     # 062A; TEH; D; BEH
11219     # The field containing 'TEH' is not used.  The next field is Joining_Type
11220     # and the last is Joining_Group
11221     # This generates two lines to pass on, one for each property on the input
11222     # line.
11223
11224     my $file = shift;
11225     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11226
11227     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11228
11229     if (@fields > 4) {
11230         $file->carp_bad_line('Extra fields');
11231         $_ = "";
11232         return;
11233     }
11234
11235     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
11236     $_ = "$fields[0]; Joining_Type; $fields[2]";
11237
11238     return;
11239 }
11240
11241 { # Closure
11242     my $lc; # Table for lowercase mapping
11243     my $tc;
11244     my $uc;
11245     my %special_casing_code_points;
11246
11247     sub setup_special_casing {
11248         # SpecialCasing.txt contains the non-simple case change mappings.  The
11249         # simple ones are in UnicodeData.txt, which should already have been
11250         # read in to the full property data structures, so as to initialize
11251         # these with the simple ones.  Then the SpecialCasing.txt entries
11252         # add or overwrite the ones which have different full mappings.
11253
11254         # This routine sees if the simple mappings are to be output, and if
11255         # so, copies what has already been put into the full mapping tables,
11256         # while they still contain only the simple mappings.
11257
11258         # The reason it is done this way is that the simple mappings are
11259         # probably not going to be output, so it saves work to initialize the
11260         # full tables with the simple mappings, and then overwrite those
11261         # relatively few entries in them that have different full mappings,
11262         # and thus skip the simple mapping tables altogether.
11263
11264         my $file= shift;
11265         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11266
11267         $lc = property_ref('lc');
11268         $tc = property_ref('tc');
11269         $uc = property_ref('uc');
11270
11271         # For each of the case change mappings...
11272         foreach my $full_table ($lc, $tc, $uc) {
11273             my $full_name = $full_table->name;
11274             unless (defined $full_table && ! $full_table->is_empty) {
11275                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
11276             }
11277
11278             # Create a table in the old-style format and with the original
11279             # file name for backwards compatibility with applications that
11280             # read it directly.  The new tables contain both the simple and
11281             # full maps, and the old are missing simple maps when there is a
11282             # conflicting full one.  Probably it would have been ok to add
11283             # those to the legacy version, as was already done in 5.14 to the
11284             # case folding one, but this was not done, out of an abundance of
11285             # caution.  The tables are set up here before we deal with the
11286             # full maps so that as we handle those, we can override the simple
11287             # maps for them in the legacy table, and merely add them in the
11288             # new-style one.
11289             my $legacy = Property->new("Legacy_" . $full_table->full_name,
11290                                         File => $full_table->full_name =~
11291                                                             s/case_Mapping//r,
11292                                         Range_Size_1 => 1,
11293                                         Format => $HEX_FORMAT,
11294                                         Default_Map => $CODE_POINT,
11295                                         UCD => 0,
11296                                         Initialize => $full_table,
11297                                         To_Output_Map => $EXTERNAL_MAP,
11298             );
11299
11300             $full_table->add_comment(join_lines( <<END
11301 This file includes both the simple and full case changing maps.  The simple
11302 ones are in the main body of the table below, and the full ones adding to or
11303 overriding them are in the hash.
11304 END
11305             ));
11306
11307             # The simple version's name in each mapping merely has an 's' in
11308             # front of the full one's
11309             my $simple_name = 's' . $full_name;
11310             my $simple = property_ref($simple_name);
11311             $simple->initialize($full_table) if $simple->to_output_map();
11312         }
11313
11314         return;
11315     }
11316
11317     sub filter_2_1_8_special_casing_line {
11318
11319         # This version had duplicate entries in this file.  Delete all but the
11320         # first one
11321         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11322                                               # fields
11323         if (exists $special_casing_code_points{$fields[0]}) {
11324             $_ = "";
11325             return;
11326         }
11327
11328         $special_casing_code_points{$fields[0]} = 1;
11329         filter_special_casing_line(@_);
11330     }
11331
11332     sub filter_special_casing_line {
11333         # Change the format of $_ from SpecialCasing.txt into something that
11334         # the generic handler understands.  Each input line contains three
11335         # case mappings.  This will generate three lines to pass to the
11336         # generic handler for each of those.
11337
11338         # The input syntax (after stripping comments and trailing white space
11339         # is like one of the following (with the final two being entries that
11340         # we ignore):
11341         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11342         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11343         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11344         # Note the trailing semi-colon, unlike many of the input files.  That
11345         # means that there will be an extra null field generated by the split
11346
11347         my $file = shift;
11348         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11349
11350         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11351                                               # fields
11352
11353         # field #4 is when this mapping is conditional.  If any of these get
11354         # implemented, it would be by hard-coding in the casing functions in
11355         # the Perl core, not through tables.  But if there is a new condition
11356         # we don't know about, output a warning.  We know about all the
11357         # conditions through 6.0
11358         if ($fields[4] ne "") {
11359             my @conditions = split ' ', $fields[4];
11360             if ($conditions[0] ne 'tr'  # We know that these languages have
11361                                         # conditions, and some are multiple
11362                 && $conditions[0] ne 'az'
11363                 && $conditions[0] ne 'lt'
11364
11365                 # And, we know about a single condition Final_Sigma, but
11366                 # nothing else.
11367                 && ($v_version gt v5.2.0
11368                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11369             {
11370                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11371             }
11372             elsif ($conditions[0] ne 'Final_Sigma') {
11373
11374                     # Don't print out a message for Final_Sigma, because we
11375                     # have hard-coded handling for it.  (But the standard
11376                     # could change what the rule should be, but it wouldn't
11377                     # show up here anyway.
11378
11379                     print "# SKIPPING Special Casing: $_\n"
11380                                                     if $verbosity >= $VERBOSE;
11381             }
11382             $_ = "";
11383             return;
11384         }
11385         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11386             $file->carp_bad_line('Extra fields');
11387             $_ = "";
11388             return;
11389         }
11390
11391         my $decimal_code_point = hex $fields[0];
11392
11393         # Loop to handle each of the three mappings in the input line, in
11394         # order, with $i indicating the current field number.
11395         my $i = 0;
11396         for my $object ($lc, $tc, $uc) {
11397             $i++;   # First time through, $i = 0 ... 3rd time = 3
11398
11399             my $value = $object->value_of($decimal_code_point);
11400             $value = ($value eq $CODE_POINT)
11401                       ? $decimal_code_point
11402                       : hex $value;
11403
11404             # If this isn't a multi-character mapping, it should already have
11405             # been read in.
11406             if ($fields[$i] !~ / /) {
11407                 if ($value != hex $fields[$i]) {
11408                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
11409                                   . $object->name
11410                                   . "(0x$fields[0]) is $value"
11411                                   . " and SpecialCasing.txt thinks it is "
11412                                   . hex($fields[$i])
11413                                   . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
11414                 }
11415             }
11416             else {
11417
11418                 # The mapping goes into both the legacy table, in which it
11419                 # replaces the simple one...
11420                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11421                                              . $object->full_name
11422                                              . "; $fields[$i]");
11423
11424                 # ... and, the The regular table, in which it is additional,
11425                 # beyond the simple mapping.
11426                 $file->insert_adjusted_lines("$fields[0]; "
11427                                              . $object->name
11428                                             . "; "
11429                                             . $CMD_DELIM
11430                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11431                                             . $CMD_DELIM
11432                                             . $fields[$i]);
11433             }
11434         }
11435
11436         # Everything has been handled by the insert_adjusted_lines()
11437         $_ = "";
11438
11439         return;
11440     }
11441 }
11442
11443 sub filter_old_style_case_folding {
11444     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11445     # and later style.  Different letters were used in the earlier.
11446
11447     my $file = shift;
11448     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11449
11450     my @fields = split /\s*;\s*/;
11451     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11452         $fields[1] = 'I';
11453     }
11454     elsif ($fields[1] eq 'L') {
11455         $fields[1] = 'C';             # L => C always
11456     }
11457     elsif ($fields[1] eq 'E') {
11458         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
11459             $fields[1] = 'F'
11460         }
11461         else {
11462             $fields[1] = 'C'
11463         }
11464     }
11465     else {
11466         $file->carp_bad_line("Expecting L or E in second field");
11467         $_ = "";
11468         return;
11469     }
11470     $_ = join("; ", @fields) . ';';
11471     return;
11472 }
11473
11474 { # Closure for case folding
11475
11476     # Create the map for simple only if are going to output it, for otherwise
11477     # it takes no part in anything we do.
11478     my $to_output_simple;
11479     my $non_final_folds;
11480     my $all_folds;
11481
11482     sub setup_case_folding($) {
11483         # Read in the case foldings in CaseFolding.txt.  This handles both
11484         # simple and full case folding.
11485
11486         $to_output_simple
11487                         = property_ref('Simple_Case_Folding')->to_output_map;
11488
11489         if (! $to_output_simple) {
11490             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11491         }
11492
11493         $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
11494                            Perl_Extension => 1,
11495                            Fate => $INTERNAL_ONLY,
11496                            Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
11497                            );
11498         $all_folds = $perl->add_match_table("_Perl_Any_Folds",
11499                            Perl_Extension => 1,
11500                            Fate => $INTERNAL_ONLY,
11501                            Description => "Code points that particpate in some fold",
11502                            );
11503
11504         # If we ever wanted to show that these tables were combined, a new
11505         # property method could be created, like set_combined_props()
11506         property_ref('Case_Folding')->add_comment(join_lines( <<END
11507 This file includes both the simple and full case folding maps.  The simple
11508 ones are in the main body of the table below, and the full ones adding to or
11509 overriding them are in the hash.
11510 END
11511         ));
11512         return;
11513     }
11514
11515     sub filter_case_folding_line {
11516         # Called for each line in CaseFolding.txt
11517         # Input lines look like:
11518         # 0041; C; 0061; # LATIN CAPITAL LETTER A
11519         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11520         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11521         #
11522         # 'C' means that folding is the same for both simple and full
11523         # 'F' that it is only for full folding
11524         # 'S' that it is only for simple folding
11525         # 'T' is locale-dependent, and ignored
11526         # 'I' is a type of 'F' used in some early releases.
11527         # Note the trailing semi-colon, unlike many of the input files.  That
11528         # means that there will be an extra null field generated by the split
11529         # below, which we ignore and hence is not an error.
11530
11531         my $file = shift;
11532         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11533
11534         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11535         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11536             $file->carp_bad_line('Extra fields');
11537             $_ = "";
11538             return;
11539         }
11540
11541         if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
11542             $_ = "";
11543             return;
11544         }
11545
11546         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
11547         # I are all full foldings; S is single-char.  For S, there is always
11548         # an F entry, so we must allow multiple values for the same code
11549         # point.  Fortunately this table doesn't need further manipulation
11550         # which would preclude using multiple-values.  The S is now included
11551         # so that _swash_inversion_hash() is able to construct closures
11552         # without having to worry about F mappings.
11553         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11554             $all_folds->add_range(hex $range, hex $range);  # Assumes range is single
11555             $_ = "$range; Case_Folding; "
11556                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11557
11558             if ($type eq 'F') {
11559                 my @string = split " ", $map;
11560                 for my $i (0 .. @string  - 1 -1) {
11561                     my $decimal = hex $string[$i];
11562                     $non_final_folds->add_range($decimal, $decimal);
11563                     $all_folds->add_range($decimal, $decimal);
11564                 }
11565             }
11566             else {
11567                 $all_folds->add_range(hex $map, hex $map);
11568             }
11569         }
11570         else {
11571             $_ = "";
11572             $file->carp_bad_line('Expecting C F I S or T in second field');
11573         }
11574
11575         # C and S are simple foldings, but simple case folding is not needed
11576         # unless we explicitly want its map table output.
11577         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11578             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11579         }
11580
11581         return;
11582     }
11583
11584 } # End case fold closure
11585
11586 sub filter_jamo_line {
11587     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
11588     # from this file that is used in generating the Name property for Jamo
11589     # code points.  But, it also is used to convert early versions' syntax
11590     # into the modern form.  Here are two examples:
11591     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
11592     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
11593     #
11594     # The input is $_, the output is $_ filtered.
11595
11596     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
11597
11598     # Let the caller handle unexpected input.  In earlier versions, there was
11599     # a third field which is supposed to be a comment, but did not have a '#'
11600     # before it.
11601     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11602
11603     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
11604                                 # beginning.
11605
11606     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
11607     $fields[1] = 'R' if $fields[0] eq '1105';
11608
11609     # Add to structure so can generate Names from it.
11610     my $cp = hex $fields[0];
11611     my $short_name = $fields[1];
11612     $Jamo{$cp} = $short_name;
11613     if ($cp <= $LBase + $LCount) {
11614         $Jamo_L{$short_name} = $cp - $LBase;
11615     }
11616     elsif ($cp <= $VBase + $VCount) {
11617         $Jamo_V{$short_name} = $cp - $VBase;
11618     }
11619     elsif ($cp <= $TBase + $TCount) {
11620         $Jamo_T{$short_name} = $cp - $TBase;
11621     }
11622     else {
11623         Carp::my_carp_bug("Unexpected Jamo code point in $_");
11624     }
11625
11626
11627     # Reassemble using just the first two fields to look like a typical
11628     # property file line
11629     $_ = "$fields[0]; $fields[1]";
11630
11631     return;
11632 }
11633
11634 sub register_fraction($) {
11635     # This registers the input rational number so that it can be passed on to
11636     # utf8_heavy.pl, both in rational and floating forms.
11637
11638     my $rational = shift;
11639
11640     my $float = eval $rational;
11641     $nv_floating_to_rational{$float} = $rational;
11642     return;
11643 }
11644
11645 sub filter_numeric_value_line {
11646     # DNumValues contains lines of a different syntax than the typical
11647     # property file:
11648     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
11649     #
11650     # This routine transforms $_ containing the anomalous syntax to the
11651     # typical, by filtering out the extra columns, and convert early version
11652     # decimal numbers to strings that look like rational numbers.
11653
11654     my $file = shift;
11655     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11656
11657     # Starting in 5.1, there is a rational field.  Just use that, omitting the
11658     # extra columns.  Otherwise convert the decimal number in the second field
11659     # to a rational, and omit extraneous columns.
11660     my @fields = split /\s*;\s*/, $_, -1;
11661     my $rational;
11662
11663     if ($v_version ge v5.1.0) {
11664         if (@fields != 4) {
11665             $file->carp_bad_line('Not 4 semi-colon separated fields');
11666             $_ = "";
11667             return;
11668         }
11669         $rational = $fields[3];
11670         $_ = join '; ', @fields[ 0, 3 ];
11671     }
11672     else {
11673
11674         # Here, is an older Unicode file, which has decimal numbers instead of
11675         # rationals in it.  Use the fraction to calculate the denominator and
11676         # convert to rational.
11677
11678         if (@fields != 2 && @fields != 3) {
11679             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11680             $_ = "";
11681             return;
11682         }
11683
11684         my $codepoints = $fields[0];
11685         my $decimal = $fields[1];
11686         if ($decimal =~ s/\.0+$//) {
11687
11688             # Anything ending with a decimal followed by nothing but 0's is an
11689             # integer
11690             $_ = "$codepoints; $decimal";
11691             $rational = $decimal;
11692         }
11693         else {
11694
11695             my $denominator;
11696             if ($decimal =~ /\.50*$/) {
11697                 $denominator = 2;
11698             }
11699
11700             # Here have the hardcoded repeating decimals in the fraction, and
11701             # the denominator they imply.  There were only a few denominators
11702             # in the older Unicode versions of this file which this code
11703             # handles, so it is easy to convert them.
11704
11705             # The 4 is because of a round-off error in the Unicode 3.2 files
11706             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11707                 $denominator = 3;
11708             }
11709             elsif ($decimal =~ /\.[27]50*$/) {
11710                 $denominator = 4;
11711             }
11712             elsif ($decimal =~ /\.[2468]0*$/) {
11713                 $denominator = 5;
11714             }
11715             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11716                 $denominator = 6;
11717             }
11718             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11719                 $denominator = 8;
11720             }
11721             if ($denominator) {
11722                 my $sign = ($decimal < 0) ? "-" : "";
11723                 my $numerator = int((abs($decimal) * $denominator) + .5);
11724                 $rational = "$sign$numerator/$denominator";
11725                 $_ = "$codepoints; $rational";
11726             }
11727             else {
11728                 $file->carp_bad_line("Can't cope with number '$decimal'.");
11729                 $_ = "";
11730                 return;
11731             }
11732         }
11733     }
11734
11735     register_fraction($rational) if $rational =~ qr{/};
11736     return;
11737 }
11738
11739 { # Closure
11740     my %unihan_properties;
11741
11742     sub setup_unihan {
11743         # Do any special setup for Unihan properties.
11744
11745         # This property gives the wrong computed type, so override.
11746         my $usource = property_ref('kIRG_USource');
11747         $usource->set_type($STRING) if defined $usource;
11748
11749         # This property is to be considered binary (it says so in
11750         # http://www.unicode.org/reports/tr38/)
11751         my $iicore = property_ref('kIICore');
11752         if (defined $iicore) {
11753             $iicore->set_type($FORCED_BINARY);
11754             $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11755
11756             # Unicode doesn't include the maps for this property, so don't
11757             # warn that they are missing.
11758             $iicore->set_pre_declared_maps(0);
11759             $iicore->add_comment(join_lines( <<END
11760 This property contains enum values, but Unicode UAX #38 says it should be
11761 interpreted as binary, so Perl creates tables for both 1) its enum values,
11762 plus 2) true/false tables in which it is considered true for all code points
11763 that have a non-null value
11764 END
11765             ));
11766         }
11767
11768         return;
11769     }
11770
11771     sub filter_unihan_line {
11772         # Change unihan db lines to look like the others in the db.  Here is
11773         # an input sample:
11774         #   U+341C        kCangjie        IEKN
11775
11776         # Tabs are used instead of semi-colons to separate fields; therefore
11777         # they may have semi-colons embedded in them.  Change these to periods
11778         # so won't screw up the rest of the code.
11779         s/;/./g;
11780
11781         # Remove lines that don't look like ones we accept.
11782         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11783             $_ = "";
11784             return;
11785         }
11786
11787         # Extract the property, and save a reference to its object.
11788         my $property = $1;
11789         if (! exists $unihan_properties{$property}) {
11790             $unihan_properties{$property} = property_ref($property);
11791         }
11792
11793         # Don't do anything unless the property is one we're handling, which
11794         # we determine by seeing if there is an object defined for it or not
11795         if (! defined $unihan_properties{$property}) {
11796             $_ = "";
11797             return;
11798         }
11799
11800         # Convert the tab separators to our standard semi-colons, and convert
11801         # the U+HHHH notation to the rest of the standard's HHHH
11802         s/\t/;/g;
11803         s/\b U \+ (?= $code_point_re )//xg;
11804
11805         #local $to_trace = 1 if main::DEBUG;
11806         trace $_ if main::DEBUG && $to_trace;
11807
11808         return;
11809     }
11810 }
11811
11812 sub filter_blocks_lines {
11813     # In the Blocks.txt file, the names of the blocks don't quite match the
11814     # names given in PropertyValueAliases.txt, so this changes them so they
11815     # do match:  Blanks and hyphens are changed into underscores.  Also makes
11816     # early release versions look like later ones
11817     #
11818     # $_ is transformed to the correct value.
11819
11820     my $file = shift;
11821         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11822
11823     if ($v_version lt v3.2.0) {
11824         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11825             $_ = "";
11826             return;
11827         }
11828
11829         # Old versions used a different syntax to mark the range.
11830         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11831     }
11832
11833     my @fields = split /\s*;\s*/, $_, -1;
11834     if (@fields != 2) {
11835         $file->carp_bad_line("Expecting exactly two fields");
11836         $_ = "";
11837         return;
11838     }
11839
11840     # Change hyphens and blanks in the block name field only
11841     $fields[1] =~ s/[ -]/_/g;
11842     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
11843
11844     $_ = join("; ", @fields);
11845     return;
11846 }
11847
11848 { # Closure
11849     my $current_property;
11850
11851     sub filter_old_style_proplist {
11852         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
11853         # was in a completely different syntax.  Ken Whistler of Unicode says
11854         # that it was something he used as an aid for his own purposes, but
11855         # was never an official part of the standard.  Many of the properties
11856         # in it were incorporated into the later PropList.txt, but some were
11857         # not.  This program uses this early file to generate property tables
11858         # that are otherwise not accessible in the early UCD's.  It does this
11859         # for the ones that eventually became official, and don't appear to be
11860         # too different in their contents from the later official version, and
11861         # throws away the rest.  It could be argued that the ones it generates
11862         # were probably not really official at that time, so should be
11863         # ignored.  You can easily modify things to skip all of them by
11864         # changing this function to just set $_ to "", and return; and to skip
11865         # certain of them by by simply removing their declarations from
11866         # get_old_property_aliases().
11867         #
11868         # Here is a list of all the ones that are thrown away:
11869         #   Alphabetic                   The definitions for this are very
11870         #                                defective, so better to not mislead
11871         #                                people into thinking it works.
11872         #                                Instead the Perl extension of the
11873         #                                same name is constructed from first
11874         #                                principles.
11875         #   Bidi=*                       duplicates UnicodeData.txt
11876         #   Combining                    never made into official property;
11877         #                                is \P{ccc=0}
11878         #   Composite                    never made into official property.
11879         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
11880         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
11881         #   Delimiter                    never made into official property;
11882         #                                removed in 3.0.1
11883         #   Format Control               never made into official property;
11884         #                                similar to gc=cf
11885         #   High Surrogate               duplicates Blocks.txt
11886         #   Ignorable Control            never made into official property;
11887         #                                similar to di=y
11888         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
11889         #   Left of Pair                 never made into official property;
11890         #   Line Separator               duplicates UnicodeData.txt: gc=zl
11891         #   Low Surrogate                duplicates Blocks.txt
11892         #   Non-break                    was actually listed as a property
11893         #                                in 3.2, but without any code
11894         #                                points.  Unicode denies that this
11895         #                                was ever an official property
11896         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
11897         #   Numeric                      duplicates UnicodeData.txt: gc=cc
11898         #   Paired Punctuation           never made into official property;
11899         #                                appears to be gc=ps + gc=pe
11900         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
11901         #   Private Use                  duplicates UnicodeData.txt: gc=co
11902         #   Private Use High Surrogate   duplicates Blocks.txt
11903         #   Punctuation                  duplicates UnicodeData.txt: gc=p
11904         #   Space                        different definition than eventual
11905         #                                one.
11906         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
11907         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
11908         #   Zero-width                   never made into official property;
11909         #                                subset of gc=cf
11910         # Most of the properties have the same names in this file as in later
11911         # versions, but a couple do not.
11912         #
11913         # This subroutine filters $_, converting it from the old style into
11914         # the new style.  Here's a sample of the old-style
11915         #
11916         #   *******************************************
11917         #
11918         #   Property dump for: 0x100000A0 (Join Control)
11919         #
11920         #   200C..200D  (2 chars)
11921         #
11922         # In the example, the property is "Join Control".  It is kept in this
11923         # closure between calls to the subroutine.  The numbers beginning with
11924         # 0x were internal to Ken's program that generated this file.
11925
11926         # If this line contains the property name, extract it.
11927         if (/^Property dump for: [^(]*\((.*)\)/) {
11928             $_ = $1;
11929
11930             # Convert white space to underscores.
11931             s/ /_/g;
11932
11933             # Convert the few properties that don't have the same name as
11934             # their modern counterparts
11935             s/Identifier_Part/ID_Continue/
11936             or s/Not_a_Character/NChar/;
11937
11938             # If the name matches an existing property, use it.
11939             if (defined property_ref($_)) {
11940                 trace "new property=", $_ if main::DEBUG && $to_trace;
11941                 $current_property = $_;
11942             }
11943             else {        # Otherwise discard it
11944                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11945                 undef $current_property;
11946             }
11947             $_ = "";    # The property is saved for the next lines of the
11948                         # file, but this defining line is of no further use,
11949                         # so clear it so that the caller won't process it
11950                         # further.
11951         }
11952         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11953
11954             # Here, the input line isn't a header defining a property for the
11955             # following section, and either we aren't in such a section, or
11956             # the line doesn't look like one that defines the code points in
11957             # such a section.  Ignore this line.
11958             $_ = "";
11959         }
11960         else {
11961
11962             # Here, we have a line defining the code points for the current
11963             # stashed property.  Anything starting with the first blank is
11964             # extraneous.  Otherwise, it should look like a normal range to
11965             # the caller.  Append the property name so that it looks just like
11966             # a modern PropList entry.
11967
11968             $_ =~ s/\s.*//;
11969             $_ .= "; $current_property";
11970         }
11971         trace $_ if main::DEBUG && $to_trace;
11972         return;
11973     }
11974 } # End closure for old style proplist
11975
11976 sub filter_old_style_normalization_lines {
11977     # For early releases of Unicode, the lines were like:
11978     #        74..2A76    ; NFKD_NO
11979     # For later releases this became:
11980     #        74..2A76    ; NFKD_QC; N
11981     # Filter $_ to look like those in later releases.
11982     # Similarly for MAYBEs
11983
11984     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11985
11986     # Also, the property FC_NFKC was abbreviated to FNC
11987     s/FNC/FC_NFKC/;
11988     return;
11989 }
11990
11991 sub setup_script_extensions {
11992     # The Script_Extensions property starts out with a clone of the Script
11993     # property.
11994
11995     my $scx = property_ref("Script_Extensions");
11996     $scx = Property->new("scx", Full_Name => "Script_Extensions")
11997                                                             if ! defined $scx;
11998     $scx->_set_format($STRING_WHITE_SPACE_LIST);
11999     $scx->initialize($script);
12000     $scx->set_default_map($script->default_map);
12001     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
12002     $scx->add_comment(join_lines( <<END
12003 The values for code points that appear in one script are just the same as for
12004 the 'Script' property.  Likewise the values for those that appear in many
12005 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
12006 values of code points that appear in a few scripts are a space separated list
12007 of those scripts.
12008 END
12009     ));
12010
12011     # Initialize scx's tables and the aliases for them to be the same as sc's
12012     foreach my $table ($script->tables) {
12013         my $scx_table = $scx->add_match_table($table->name,
12014                                 Full_Name => $table->full_name);
12015         foreach my $alias ($table->aliases) {
12016             $scx_table->add_alias($alias->name);
12017         }
12018     }
12019 }
12020
12021 sub  filter_script_extensions_line {
12022     # The Scripts file comes with the full name for the scripts; the
12023     # ScriptExtensions, with the short name.  The final mapping file is a
12024     # combination of these, and without adjustment, would have inconsistent
12025     # entries.  This filters the latter file to convert to full names.
12026     # Entries look like this:
12027     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12028
12029     my @fields = split /\s*;\s*/;
12030
12031     # This script was erroneously omitted in this Unicode version.
12032     $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12033
12034     my @full_names;
12035     foreach my $short_name (split " ", $fields[1]) {
12036         push @full_names, $script->table($short_name)->full_name;
12037     }
12038     $fields[1] = join " ", @full_names;
12039     $_ = join "; ", @fields;
12040
12041     return;
12042 }
12043
12044 sub generate_hst {
12045
12046     # Populates the Hangul Syllable Type property from first principles
12047
12048     my $file= shift;
12049     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12050
12051     # These few ranges are hard-coded in.
12052     $file->insert_lines(split /\n/, <<'END'
12053 1100..1159    ; L
12054 115F          ; L
12055 1160..11A2    ; V
12056 11A8..11F9    ; T
12057 END
12058 );
12059
12060     # The Hangul syllables in version 1 are completely different than what came
12061     # after, so just ignore them there.
12062     if ($v_version lt v2.0.0) {
12063         my $property = property_ref($file->property);
12064         push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12065         push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12066         return;
12067     }
12068
12069     # The algorithmically derived syllables are almost all LVT ones, so
12070     # initialize the whole range with that.
12071     $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12072                         $SBase, $SBase + $SCount -1);
12073
12074     # Those ones that aren't LVT are LV, and they occur at intervals of
12075     # $TCount code points, starting with the first code point, at $SBase.
12076     for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12077         $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12078     }
12079
12080     return;
12081 }
12082
12083 sub generate_GCB {
12084
12085     # Populates the Grapheme Cluster Break property from first principles
12086
12087     my $file= shift;
12088     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12089
12090     # All these definitions are from
12091     # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12092     # from http://www.unicode.org/reports/tr29/tr29-4.html
12093
12094     foreach my $range ($gc->ranges) {
12095
12096         # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12097         # and gc=Cf
12098         if ($range->value =~ / ^ M [en] $ /x) {
12099             $file->insert_lines(sprintf "%04X..%04X; Extend",
12100                                 $range->start,  $range->end);
12101         }
12102         elsif ($range->value =~ / ^ C [cf] $ /x) {
12103             $file->insert_lines(sprintf "%04X..%04X; Control",
12104                                 $range->start,  $range->end);
12105         }
12106     }
12107     $file->insert_lines("2028; Control"); # Line Separator
12108     $file->insert_lines("2029; Control"); # Paragraph Separator
12109
12110     $file->insert_lines("000D; CR");
12111     $file->insert_lines("000A; LF");
12112
12113     # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12114     foreach my $code_point ( qw{
12115                                 40000
12116                                 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12117                                 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12118                                 }
12119     ) {
12120         my $category = $gc->value_of(hex $code_point);
12121         next if ! defined $category || $category eq 'Cn'; # But not if
12122                                                           # unassigned in this
12123                                                           # release
12124         $file->insert_lines("$code_point; Extend");
12125     }
12126
12127     my $hst = property_ref('Hangul_Syllable_Type');
12128     if ($hst->count > 0) {
12129         foreach my $range ($hst->ranges) {
12130             $file->insert_lines(sprintf "%04X..%04X; %s",
12131                                     $range->start, $range->end, $range->value);
12132         }
12133     }
12134     else {
12135         generate_hst($file);
12136     }
12137
12138     return;
12139 }
12140
12141 sub setup_early_name_alias {
12142     my $file= shift;
12143     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12144
12145     # This has the effect of pretending that the Name_Alias property was
12146     # available in all Unicode releases.  Strictly speaking, this property
12147     # should not be availabe in early releases, but doing this allows
12148     # charnames.pm to work on older releases without change.  Prior to v5.16
12149     # it had these names hard-coded inside it.  Unicode 6.1 came along and
12150     # created these names, and so they were removed from charnames.
12151
12152     my $aliases = property_ref('Name_Alias');
12153     if (! defined $aliases) {
12154         $aliases = Property->new('Name_Alias', Default_Map => "");
12155     }
12156
12157     $file->insert_lines(get_old_name_aliases());
12158
12159     return;
12160 }
12161
12162 sub get_old_name_aliases () {
12163
12164     # The Unicode_1_Name field, contains most of these names.  One would
12165     # expect, given the field's name, that its values would be fixed across
12166     # versions, giving the true Unicode version 1 name for the character.
12167     # Sadly, this is not the case.  Actually Version 1.1.5 had no names for
12168     # any of the controls; Version 2.0 introduced names for the C0 controls,
12169     # and 3.0 introduced C1 names.  3.0.1 removed the name INDEX; and 3.2
12170     # changed some names: it
12171     #   changed to parenthesized versions like "NEXT LINE" to
12172     #       "NEXT LINE (NEL)";
12173     #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12174     #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12175     #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12176     # This list contains all the names that were defined so that
12177     # charnames::vianame(), etc. understand them all EVEN if this version of
12178     # Unicode didn't specify them (this could be construed as a bug).
12179     # mktables elsewhere gives preference to the Unicode_1_Name field over
12180     # these names, so that viacode() will return the correct value for that
12181     # version of Unicode, except when that version doesn't define a name,
12182     # viacode() will return one anyway (this also could be construed as a
12183     # bug).  But these potential "bugs" allow for the smooth working of code
12184     # on earlier Unicode releases.
12185
12186     my @return = split /\n/, <<'END';
12187 0000;NULL;control
12188 0000;NUL;abbreviation
12189 0001;START OF HEADING;control
12190 0001;SOH;abbreviation
12191 0002;START OF TEXT;control
12192 0002;STX;abbreviation
12193 0003;END OF TEXT;control
12194 0003;ETX;abbreviation
12195 0004;END OF TRANSMISSION;control
12196 0004;EOT;abbreviation
12197 0005;ENQUIRY;control
12198 0005;ENQ;abbreviation
12199 0006;ACKNOWLEDGE;control
12200 0006;ACK;abbreviation
12201 0007;BELL;control
12202 0007;BEL;abbreviation
12203 0008;BACKSPACE;control
12204 0008;BS;abbreviation
12205 0009;CHARACTER TABULATION;control
12206 0009;HORIZONTAL TABULATION;control
12207 0009;HT;abbreviation
12208 0009;TAB;abbreviation
12209 000A;LINE FEED;control
12210 000A;LINE FEED (LF);control
12211 000A;NEW LINE;control
12212 000A;END OF LINE;control
12213 000A;LF;abbreviation
12214 000A;NL;abbreviation
12215 000A;EOL;abbreviation
12216 000B;LINE TABULATION;control
12217 000B;VERTICAL TABULATION;control
12218 000B;VT;abbreviation
12219 000C;FORM FEED;control
12220 000C;FORM FEED (FF);control
12221 000C;FF;abbreviation
12222 000D;CARRIAGE RETURN;control
12223 000D;CARRIAGE RETURN (CR);control
12224 000D;CR;abbreviation
12225 000E;SHIFT OUT;control
12226 000E;LOCKING-SHIFT ONE;control
12227 000E;SO;abbreviation
12228 000F;SHIFT IN;control
12229 000F;LOCKING-SHIFT ZERO;control
12230 000F;SI;abbreviation
12231 0010;DATA LINK ESCAPE;control
12232 0010;DLE;abbreviation
12233 0011;DEVICE CONTROL ONE;control
12234 0011;DC1;abbreviation
12235 0012;DEVICE CONTROL TWO;control
12236 0012;DC2;abbreviation
12237 0013;DEVICE CONTROL THREE;control
12238 0013;DC3;abbreviation
12239 0014;DEVICE CONTROL FOUR;control
12240 0014;DC4;abbreviation
12241 0015;NEGATIVE ACKNOWLEDGE;control
12242 0015;NAK;abbreviation
12243 0016;SYNCHRONOUS IDLE;control
12244 0016;SYN;abbreviation
12245 0017;END OF TRANSMISSION BLOCK;control
12246 0017;ETB;abbreviation
12247 0018;CANCEL;control
12248 0018;CAN;abbreviation
12249 0019;END OF MEDIUM;control
12250 0019;EOM;abbreviation
12251 001A;SUBSTITUTE;control
12252 001A;SUB;abbreviation
12253 001B;ESCAPE;control
12254 001B;ESC;abbreviation
12255 001C;INFORMATION SEPARATOR FOUR;control
12256 001C;FILE SEPARATOR;control
12257 001C;FS;abbreviation
12258 001D;INFORMATION SEPARATOR THREE;control
12259 001D;GROUP SEPARATOR;control
12260 001D;GS;abbreviation
12261 001E;INFORMATION SEPARATOR TWO;control
12262 001E;RECORD SEPARATOR;control
12263 001E;RS;abbreviation
12264 001F;INFORMATION SEPARATOR ONE;control
12265 001F;UNIT SEPARATOR;control
12266 001F;US;abbreviation
12267 0020;SP;abbreviation
12268 007F;DELETE;control
12269 007F;DEL;abbreviation
12270 0080;PADDING CHARACTER;figment
12271 0080;PAD;abbreviation
12272 0081;HIGH OCTET PRESET;figment
12273 0081;HOP;abbreviation
12274 0082;BREAK PERMITTED HERE;control
12275 0082;BPH;abbreviation
12276 0083;NO BREAK HERE;control
12277 0083;NBH;abbreviation
12278 0084;INDEX;control
12279 0084;IND;abbreviation
12280 0085;NEXT LINE;control
12281 0085;NEXT LINE (NEL);control
12282 0085;NEL;abbreviation
12283 0086;START OF SELECTED AREA;control
12284 0086;SSA;abbreviation
12285 0087;END OF SELECTED AREA;control
12286 0087;ESA;abbreviation
12287 0088;CHARACTER TABULATION SET;control
12288 0088;HORIZONTAL TABULATION SET;control
12289 0088;HTS;abbreviation
12290 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12291 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12292 0089;HTJ;abbreviation
12293 008A;LINE TABULATION SET;control
12294 008A;VERTICAL TABULATION SET;control
12295 008A;VTS;abbreviation
12296 008B;PARTIAL LINE FORWARD;control
12297 008B;PARTIAL LINE DOWN;control
12298 008B;PLD;abbreviation
12299 008C;PARTIAL LINE BACKWARD;control
12300 008C;PARTIAL LINE UP;control
12301 008C;PLU;abbreviation
12302 008D;REVERSE LINE FEED;control
12303 008D;REVERSE INDEX;control
12304 008D;RI;abbreviation
12305 008E;SINGLE SHIFT TWO;control
12306 008E;SINGLE-SHIFT-2;control
12307 008E;SS2;abbreviation
12308 008F;SINGLE SHIFT THREE;control
12309 008F;SINGLE-SHIFT-3;control
12310 008F;SS3;abbreviation
12311 0090;DEVICE CONTROL STRING;control
12312 0090;DCS;abbreviation
12313 0091;PRIVATE USE ONE;control
12314 0091;PRIVATE USE-1;control
12315 0091;PU1;abbreviation
12316 0092;PRIVATE USE TWO;control
12317 0092;PRIVATE USE-2;control
12318 0092;PU2;abbreviation
12319 0093;SET TRANSMIT STATE;control
12320 0093;STS;abbreviation
12321 0094;CANCEL CHARACTER;control
12322 0094;CCH;abbreviation
12323 0095;MESSAGE WAITING;control
12324 0095;MW;abbreviation
12325 0096;START OF GUARDED AREA;control
12326 0096;START OF PROTECTED AREA;control
12327 0096;SPA;abbreviation
12328 0097;END OF GUARDED AREA;control
12329 0097;END OF PROTECTED AREA;control
12330 0097;EPA;abbreviation
12331 0098;START OF STRING;control
12332 0098;SOS;abbreviation
12333 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12334 0099;SGC;abbreviation
12335 009A;SINGLE CHARACTER INTRODUCER;control
12336 009A;SCI;abbreviation
12337 009B;CONTROL SEQUENCE INTRODUCER;control
12338 009B;CSI;abbreviation
12339 009C;STRING TERMINATOR;control
12340 009C;ST;abbreviation
12341 009D;OPERATING SYSTEM COMMAND;control
12342 009D;OSC;abbreviation
12343 009E;PRIVACY MESSAGE;control
12344 009E;PM;abbreviation
12345 009F;APPLICATION PROGRAM COMMAND;control
12346 009F;APC;abbreviation
12347 00A0;NBSP;abbreviation
12348 00AD;SHY;abbreviation
12349 200B;ZWSP;abbreviation
12350 200C;ZWNJ;abbreviation
12351 200D;ZWJ;abbreviation
12352 200E;LRM;abbreviation
12353 200F;RLM;abbreviation
12354 202A;LRE;abbreviation
12355 202B;RLE;abbreviation
12356 202C;PDF;abbreviation
12357 202D;LRO;abbreviation
12358 202E;RLO;abbreviation
12359 FEFF;BYTE ORDER MARK;alternate
12360 FEFF;BOM;abbreviation
12361 FEFF;ZWNBSP;abbreviation
12362 END
12363
12364     if ($v_version ge v3.0.0) {
12365         push @return, split /\n/, <<'END';
12366 180B; FVS1; abbreviation
12367 180C; FVS2; abbreviation
12368 180D; FVS3; abbreviation
12369 180E; MVS; abbreviation
12370 202F; NNBSP; abbreviation
12371 END
12372     }
12373
12374     if ($v_version ge v3.2.0) {
12375         push @return, split /\n/, <<'END';
12376 034F; CGJ; abbreviation
12377 205F; MMSP; abbreviation
12378 2060; WJ; abbreviation
12379 END
12380         # Add in VS1..VS16
12381         my $cp = 0xFE00 - 1;
12382         for my $i (1..16) {
12383             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12384         }
12385     }
12386     if ($v_version ge v4.0.0) { # Add in VS17..VS256
12387         my $cp = 0xE0100 - 17;
12388         for my $i (17..256) {
12389             push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12390         }
12391     }
12392
12393     # ALERT did not come along until 6.0, at which point it became preferred
12394     # over BELL, and was never in the Unicode_1_Name field.  For the same
12395     # reasons, that the other names are made known to all releases by this
12396     # function, we make ALERT known too.  By inserting it
12397     # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12398     my $alert = '0007; ALERT; control';
12399     if ($v_version lt v6.0.0) {
12400         push @return, $alert;
12401     }
12402     else {
12403         unshift @return, $alert;
12404     }
12405
12406     return @return;
12407 }
12408
12409 sub filter_later_version_name_alias_line {
12410
12411     # This file has an extra entry per line for the alias type.  This is
12412     # handled by creating a compound entry: "$alias: $type";  First, split
12413     # the line into components.
12414     my ($range, $alias, $type, @remainder)
12415         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12416
12417     # This file contains multiple entries for some components, so tell the
12418     # downstream code to allow this in our internal tables; the
12419     # $MULTIPLE_AFTER preserves the input ordering.
12420     $_ = join ";", $range, $CMD_DELIM
12421                            . $REPLACE_CMD
12422                            . '='
12423                            . $MULTIPLE_AFTER
12424                            . $CMD_DELIM
12425                            . "$alias: $type",
12426                    @remainder;
12427     return;
12428 }
12429
12430 sub filter_early_version_name_alias_line {
12431
12432     # Early versions did not have the trailing alias type field; implicitly it
12433     # was 'correction'.   But our synthetic lines we add in this program do
12434     # have it, so test for the type field.
12435     $_ .= "; correction" if $_ !~ /;.*;/;
12436
12437     filter_later_version_name_alias_line;
12438     return;
12439 }
12440
12441 sub finish_Unicode() {
12442     # This routine should be called after all the Unicode files have been read
12443     # in.  It:
12444     # 1) Creates properties that are missing from the version of Unicode being
12445     #    compiled, and which, for whatever reason, are needed for the Perl
12446     #    core to function properly.  These are minimally populated as
12447     #    necessary.
12448     # 2) Adds the mappings for code points missing from the files which have
12449     #    defaults specified for them.
12450     # 3) At this this point all mappings are known, so it computes the type of
12451     #    each property whose type hasn't been determined yet.
12452     # 4) Calculates all the regular expression match tables based on the
12453     #    mappings.
12454     # 5) Calculates and adds the tables which are defined by Unicode, but
12455     #    which aren't derived by them, and certain derived tables that Perl
12456     #    uses.
12457
12458     # Folding information was introduced later into Unicode data.  To get
12459     # Perl's case ignore (/i) to work at all in releases that don't have
12460     # folding, use the best available alternative, which is lower casing.
12461     my $fold = property_ref('Case_Folding');
12462     if ($fold->is_empty) {
12463         $fold->initialize(property_ref('Lowercase_Mapping'));
12464         $fold->add_note(join_lines(<<END
12465 WARNING: This table uses lower case as a substitute for missing fold
12466 information
12467 END
12468         ));
12469     }
12470
12471     # Multiple-character mapping was introduced later into Unicode data, so it
12472     # is by default the simple version.  If to output the simple versions and
12473     # not present, just use the regular (which in these Unicode versions is
12474     # the simple as well).
12475     foreach my $map (qw {   Uppercase_Mapping
12476                             Lowercase_Mapping
12477                             Titlecase_Mapping
12478                             Case_Folding
12479                         } )
12480     {
12481         my $simple = property_ref("Simple_$map");
12482         next if ! $simple->is_empty;
12483         if ($simple->to_output_map) {
12484             $simple->initialize(property_ref($map));
12485         }
12486         else {
12487             property_ref($map)->set_proxy_for($simple->name);
12488         }
12489     }
12490
12491     # For each property, fill in any missing mappings, and calculate the re
12492     # match tables.  If a property has more than one missing mapping, the
12493     # default is a reference to a data structure, and requires data from other
12494     # properties to resolve.  The sort is used to cause these to be processed
12495     # last, after all the other properties have been calculated.
12496     # (Fortunately, the missing properties so far don't depend on each other.)
12497     foreach my $property
12498         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12499         property_ref('*'))
12500     {
12501         # $perl has been defined, but isn't one of the Unicode properties that
12502         # need to be finished up.
12503         next if $property == $perl;
12504
12505         # Nor do we need to do anything with properties that aren't going to
12506         # be output.
12507         next if $property->fate == $SUPPRESSED;
12508
12509         # Handle the properties that have more than one possible default
12510         if (ref $property->default_map) {
12511             my $default_map = $property->default_map;
12512
12513             # These properties have stored in the default_map:
12514             # One or more of:
12515             #   1)  A default map which applies to all code points in a
12516             #       certain class
12517             #   2)  an expression which will evaluate to the list of code
12518             #       points in that class
12519             # And
12520             #   3) the default map which applies to every other missing code
12521             #      point.
12522             #
12523             # Go through each list.
12524             while (my ($default, $eval) = $default_map->get_next_defaults) {
12525
12526                 # Get the class list, and intersect it with all the so-far
12527                 # unspecified code points yielding all the code points
12528                 # in the class that haven't been specified.
12529                 my $list = eval $eval;
12530                 if ($@) {
12531                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12532                     last;
12533                 }
12534
12535                 # Narrow down the list to just those code points we don't have
12536                 # maps for yet.
12537                 $list = $list & $property->inverse_list;
12538
12539                 # Add mappings to the property for each code point in the list
12540                 foreach my $range ($list->ranges) {
12541                     $property->add_map($range->start, $range->end, $default,
12542                     Replace => $CROAK);
12543                 }
12544             }
12545
12546             # All remaining code points have the other mapping.  Set that up
12547             # so the normal single-default mapping code will work on them
12548             $property->set_default_map($default_map->other_default);
12549
12550             # And fall through to do that
12551         }
12552
12553         # We should have enough data now to compute the type of the property.
12554         $property->compute_type;
12555         my $property_type = $property->type;
12556
12557         next if ! $property->to_create_match_tables;
12558
12559         # Here want to create match tables for this property
12560
12561         # The Unicode db always (so far, and they claim into the future) have
12562         # the default for missing entries in binary properties be 'N' (unless
12563         # there is a '@missing' line that specifies otherwise)
12564         if ($property_type == $BINARY && ! defined $property->default_map) {
12565             $property->set_default_map('N');
12566         }
12567
12568         # Add any remaining code points to the mapping, using the default for
12569         # missing code points.
12570         my $default_table;
12571         if (defined (my $default_map = $property->default_map)) {
12572
12573             # Make sure there is a match table for the default
12574             if (! defined ($default_table = $property->table($default_map))) {
12575                 $default_table = $property->add_match_table($default_map);
12576             }
12577
12578             # And, if the property is binary, the default table will just
12579             # be the complement of the other table.
12580             if ($property_type == $BINARY) {
12581                 my $non_default_table;
12582
12583                 # Find the non-default table.
12584                 for my $table ($property->tables) {
12585                     next if $table == $default_table;
12586                     $non_default_table = $table;
12587                 }
12588                 $default_table->set_complement($non_default_table);
12589             }
12590             else {
12591
12592                 # This fills in any missing values with the default.  It's not
12593                 # necessary to do this with binary properties, as the default
12594                 # is defined completely in terms of the Y table.
12595                 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
12596                                    $default_map, Replace => $NO);
12597             }
12598         }
12599
12600         # Have all we need to populate the match tables.
12601         my $property_name = $property->name;
12602         my $maps_should_be_defined = $property->pre_declared_maps;
12603         foreach my $range ($property->ranges) {
12604             my $map = $range->value;
12605             my $table = $property->table($map);
12606             if (! defined $table) {
12607
12608                 # Integral and rational property values are not necessarily
12609                 # defined in PropValueAliases, but whether all the other ones
12610                 # should be depends on the property.
12611                 if ($maps_should_be_defined
12612                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12613                 {
12614                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
12615                 }
12616                 $table = $property->add_match_table($map);
12617             }
12618
12619             next if $table->complement != 0;    # Don't need to populate these
12620             $table->add_range($range->start, $range->end);
12621         }
12622
12623         # A forced binary property has additional true/false tables which
12624         # should have been set up when it was forced into binary.  The false
12625         # table matches exactly the same set as the property's default table.
12626         # The true table matches the complement of that.  The false table is
12627         # not the same as an additional set of aliases on top of the default
12628         # table, so use 'set_equivalent_to'.  If it were implemented as
12629         # additional aliases, various things would have to be adjusted, but
12630         # especially, if the user wants to get a list of names for the table
12631         # using Unicode::UCD::prop_value_aliases(), s/he should get a
12632         # different set depending on whether they want the default table or
12633         # the false table.
12634         if ($property_type == $FORCED_BINARY) {
12635             $property->table('N')->set_equivalent_to($default_table,
12636                                                      Related => 1);
12637             $property->table('Y')->set_complement($default_table);
12638         }
12639
12640         # For Perl 5.6 compatibility, all properties matchable in regexes can
12641         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
12642         # But warn if this creates a conflict with a (new) Unicode property
12643         # name, although it appears that Unicode has made a decision never to
12644         # begin a property name with 'Is_', so this shouldn't happen.
12645         foreach my $alias ($property->aliases) {
12646             my $Is_name = 'Is_' . $alias->name;
12647             if (defined (my $pre_existing = property_ref($Is_name))) {
12648                 Carp::my_carp(<<END
12649 There is already an alias named $Is_name (from " . $pre_existing . "), so
12650 creating one for $property won't work.  This is bad news.  If it is not too
12651 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
12652 from the git blame log for this area of the code that suppressed individual
12653 aliases that conflict with the new Unicode names.  Proceeding anyway.
12654 END
12655                 );
12656             }
12657         } # End of loop through aliases for this property
12658     } # End of loop through all Unicode properties.
12659
12660     # Fill in the mappings that Unicode doesn't completely furnish.  First the
12661     # single letter major general categories.  If Unicode were to start
12662     # delivering the values, this would be redundant, but better that than to
12663     # try to figure out if should skip and not get it right.  Ths could happen
12664     # if a new major category were to be introduced, and the hard-coded test
12665     # wouldn't know about it.
12666     # This routine depends on the standard names for the general categories
12667     # being what it thinks they are, like 'Cn'.  The major categories are the
12668     # union of all the general category tables which have the same first
12669     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12670     foreach my $minor_table ($gc->tables) {
12671         my $minor_name = $minor_table->name;
12672         next if length $minor_name == 1;
12673         if (length $minor_name != 2) {
12674             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
12675             next;
12676         }
12677
12678         my $major_name = uc(substr($minor_name, 0, 1));
12679         my $major_table = $gc->table($major_name);
12680         $major_table += $minor_table;
12681     }
12682
12683     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
12684     # defines it as LC)
12685     my $LC = $gc->table('LC');
12686     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
12687     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
12688
12689
12690     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12691                          # deliver the correct values in it
12692         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12693
12694         # Lt not in release 1.
12695         if (defined $gc->table('Lt')) {
12696             $LC += $gc->table('Lt');
12697             $gc->table('Lt')->set_caseless_equivalent($LC);
12698         }
12699     }
12700     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12701
12702     $gc->table('Ll')->set_caseless_equivalent($LC);
12703     $gc->table('Lu')->set_caseless_equivalent($LC);
12704
12705     my $Cs = $gc->table('Cs');
12706
12707     # Create digit and case fold tables with the original file names for
12708     # backwards compatibility with applications that read them directly.
12709     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12710                               Default_Map => "",
12711                               Perl_Extension => 1,
12712                               File => 'Digit',    # Trad. location
12713                               Directory => $map_directory,
12714                               UCD => 0,
12715                               Type => $STRING,
12716                               To_Output_Map => $EXTERNAL_MAP,
12717                               Range_Size_1 => 1,
12718                               Initialize => property_ref('Perl_Decimal_Digit'),
12719                             );
12720     $Digit->add_comment(join_lines(<<END
12721 This file gives the mapping of all code points which represent a single
12722 decimal digit [0-9] to their respective digits.  For example, the code point
12723 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
12724 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12725 numerals.
12726 END
12727     ));
12728
12729     Property->new('Legacy_Case_Folding',
12730                     File => "Fold",
12731                     Directory => $map_directory,
12732                     Default_Map => $CODE_POINT,
12733                     UCD => 0,
12734                     Range_Size_1 => 1,
12735                     Type => $STRING,
12736                     To_Output_Map => $EXTERNAL_MAP,
12737                     Format => $HEX_FORMAT,
12738                     Initialize => property_ref('cf'),
12739     );
12740
12741     # The Script_Extensions property started out as a clone of the Script
12742     # property.  But processing its data file caused some elements to be
12743     # replaced with different data.  (These elements were for the Common and
12744     # Inherited properties.)  This data is a qw() list of all the scripts that
12745     # the code points in the given range are in.  An example line is:
12746     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
12747     #
12748     # The code above has created a new match table named "Arab Syrc Thaa"
12749     # which contains 060C.  (The cloned table started out with this code point
12750     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
12751     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
12752     # match table.  This is repeated for all these tables and ranges.  The map
12753     # data is retained in the map table for reference, but the spurious match
12754     # tables are deleted.
12755
12756     my $scx = property_ref("Script_Extensions");
12757     if (defined $scx) {
12758         foreach my $table ($scx->tables) {
12759             next unless $table->name =~ /\s/;   # All the new and only the new
12760                                                 # tables have a space in their
12761                                                 # names
12762             my @scripts = split /\s+/, $table->name;
12763             foreach my $script (@scripts) {
12764                 my $script_table = $scx->table($script);
12765                 $script_table += $table;
12766             }
12767             $scx->delete_match_table($table);
12768         }
12769     }
12770
12771     return;
12772 }
12773
12774 sub pre_3_dot_1_Nl () {
12775
12776     # Return a range list for gc=nl for Unicode versions prior to 3.1, which
12777     # is when Unicode's became fully usable.  These code points were
12778     # determined by inspection and experimentation.  gc=nl is important for
12779     # certain Perl-extension properties that should be available in all
12780     # releases.
12781
12782     my $Nl = Range_List->new();
12783     if (defined (my $official = $gc->table('Nl'))) {
12784         $Nl += $official;
12785     }
12786     else {
12787         $Nl->add_range(0x2160, 0x2182);
12788         $Nl->add_range(0x3007, 0x3007);
12789         $Nl->add_range(0x3021, 0x3029);
12790     }
12791     $Nl->add_range(0xFE20, 0xFE23);
12792     $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
12793                                                             # these were added
12794     return $Nl;
12795 }
12796
12797 sub compile_perl() {
12798     # Create perl-defined tables.  Almost all are part of the pseudo-property
12799     # named 'perl' internally to this program.  Many of these are recommended
12800     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
12801     # on those found there.
12802     # Almost all of these are equivalent to some Unicode property.
12803     # A number of these properties have equivalents restricted to the ASCII
12804     # range, with their names prefaced by 'Posix', to signify that these match
12805     # what the Posix standard says they should match.  A couple are
12806     # effectively this, but the name doesn't have 'Posix' in it because there
12807     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
12808     # to the full Unicode range, by our guesses as to what is appropriate.
12809
12810     # 'Any' is all code points.  As an error check, instead of just setting it
12811     # to be that, construct it to be the union of all the major categories
12812     $Any = $perl->add_match_table('Any',
12813             Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
12814             Matches_All => 1);
12815
12816     foreach my $major_table ($gc->tables) {
12817
12818         # Major categories are the ones with single letter names.
12819         next if length($major_table->name) != 1;
12820
12821         $Any += $major_table;
12822     }
12823
12824     if ($Any->max != $MAX_UNICODE_CODEPOINT) {
12825         Carp::my_carp_bug("Generated highest code point ("
12826            . sprintf("%X", $Any->max)
12827            . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
12828     }
12829     if ($Any->range_count != 1 || $Any->min != 0) {
12830      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
12831     }
12832
12833     $Any->add_alias('All');
12834
12835     # Assigned is the opposite of gc=unassigned
12836     my $Assigned = $perl->add_match_table('Assigned',
12837                                 Description  => "All assigned code points",
12838                                 Initialize => ~ $gc->table('Unassigned'),
12839                                 );
12840
12841     # Our internal-only property should be treated as more than just a
12842     # synonym; grandfather it in to the pod.
12843     $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
12844                             Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
12845             ->set_equivalent_to(property_ref('ccc')->table('Above'),
12846                                                                 Related => 1);
12847
12848     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
12849     if (defined $block) {   # This is equivalent to the block if have it.
12850         my $Unicode_ASCII = $block->table('Basic_Latin');
12851         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
12852             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
12853         }
12854     }
12855
12856     # Very early releases didn't have blocks, so initialize ASCII ourselves if
12857     # necessary
12858     if ($ASCII->is_empty) {
12859         $ASCII->add_range(0, 127);
12860     }
12861
12862     # Get the best available case definitions.  Early Unicode versions didn't
12863     # have Uppercase and Lowercase defined, so use the general category
12864     # instead for them, modified by hard-coding in the code points each is
12865     # missing.
12866     my $Lower = $perl->add_match_table('Lower');
12867     my $Unicode_Lower = property_ref('Lowercase');
12868     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
12869         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
12870
12871     }
12872     else {
12873         $Lower += $gc->table('Lowercase_Letter');
12874
12875         # There are quite a few code points in Lower, that aren't in gc=lc,
12876         # and not all are in all releases.
12877         foreach my $code_point (    0x00AA,
12878                                     0x00BA,
12879                                     0x02B0 .. 0x02B8,
12880                                     0x02C0 .. 0x02C1,
12881                                     0x02E0 .. 0x02E4,
12882                                     0x0345,
12883                                     0x037A,
12884                                     0x1D2C .. 0x1D6A,
12885                                     0x1D78,
12886                                     0x1D9B .. 0x1DBF,
12887                                     0x2071,
12888                                     0x207F,
12889                                     0x2090 .. 0x209C,
12890                                     0x2170 .. 0x217F,
12891                                     0x24D0 .. 0x24E9,
12892                                     0x2C7C .. 0x2C7D,
12893                                     0xA770,
12894                                     0xA7F8 .. 0xA7F9,
12895         ) {
12896             # Don't include the code point unless it is assigned in this
12897             # release
12898             my $category = $gc->value_of(hex $code_point);
12899             next if ! defined $category || $category eq 'Cn';
12900
12901             $Lower += $code_point;
12902         }
12903     }
12904     $Lower->add_alias('XPosixLower');
12905     my $Posix_Lower = $perl->add_match_table("PosixLower",
12906                             Description => "[a-z]",
12907                             Initialize => $Lower & $ASCII,
12908                             );
12909
12910     my $Upper = $perl->add_match_table('Upper');
12911     my $Unicode_Upper = property_ref('Uppercase');
12912     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
12913         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
12914     }
12915     else {
12916
12917         # Unlike Lower, there are only two ranges in Upper that aren't in
12918         # gc=Lu, and all code points were assigned in all releases.
12919         $Upper += $gc->table('Uppercase_Letter');
12920         $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
12921         $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
12922     }
12923     $Upper->add_alias('XPosixUpper');
12924     my $Posix_Upper = $perl->add_match_table("PosixUpper",
12925                             Description => "[A-Z]",
12926                             Initialize => $Upper & $ASCII,
12927                             );
12928
12929     # Earliest releases didn't have title case.  Initialize it to empty if not
12930     # otherwise present
12931     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
12932                                        Description => '(= \p{Gc=Lt})');
12933     my $lt = $gc->table('Lt');
12934
12935     # Earlier versions of mktables had this related to $lt since they have
12936     # identical code points, but their caseless equivalents are not the same,
12937     # one being 'Cased' and the other being 'LC', and so now must be kept as
12938     # separate entities.
12939     if (defined $lt) {
12940         $Title += $lt;
12941     }
12942     else {
12943         push @tables_that_may_be_empty, $Title->complete_name;
12944     }
12945
12946     my $Unicode_Cased = property_ref('Cased');
12947     if (defined $Unicode_Cased) {
12948         my $yes = $Unicode_Cased->table('Y');
12949         my $no = $Unicode_Cased->table('N');
12950         $Title->set_caseless_equivalent($yes);
12951         if (defined $Unicode_Upper) {
12952             $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
12953             $Unicode_Upper->table('N')->set_caseless_equivalent($no);
12954         }
12955         $Upper->set_caseless_equivalent($yes);
12956         if (defined $Unicode_Lower) {
12957             $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
12958             $Unicode_Lower->table('N')->set_caseless_equivalent($no);
12959         }
12960         $Lower->set_caseless_equivalent($yes);
12961     }
12962     else {
12963         # If this Unicode version doesn't have Cased, set up the Perl
12964         # extension from first principles.  From Unicode 5.1: Definition D120:
12965         # A character C is defined to be cased if and only if C has the
12966         # Lowercase or Uppercase property or has a General_Category value of
12967         # Titlecase_Letter.
12968         my $cased = $perl->add_match_table('Cased',
12969                         Initialize => $Lower + $Upper + $Title,
12970                         Description => 'Uppercase or Lowercase or Titlecase',
12971                         );
12972         # $notcased is purely for the caseless equivalents below
12973         my $notcased = $perl->add_match_table('_Not_Cased',
12974                                 Initialize => ~ $cased,
12975                                 Fate => $INTERNAL_ONLY,
12976                                 Description => 'All not-cased code points');
12977         $Title->set_caseless_equivalent($cased);
12978         if (defined $Unicode_Upper) {
12979             $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
12980             $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
12981         }
12982         $Upper->set_caseless_equivalent($cased);
12983         if (defined $Unicode_Lower) {
12984             $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
12985             $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
12986         }
12987         $Lower->set_caseless_equivalent($cased);
12988     }
12989
12990     # Similarly, set up our own Case_Ignorable property if this Unicode
12991     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
12992     # C is defined to be case-ignorable if C has the value MidLetter or the
12993     # value MidNumLet for the Word_Break property or its General_Category is
12994     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
12995     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
12996
12997     # Perl has long had an internal-only alias for this property; grandfather
12998     # it in to the pod, but discourage its use.
12999     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13000                                                      Re_Pod_Entry => 1,
13001                                                      Fate => $INTERNAL_ONLY,
13002                                                      Status => $DISCOURAGED);
13003     my $case_ignorable = property_ref('Case_Ignorable');
13004     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13005         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13006                                                                 Related => 1);
13007     }
13008     else {
13009
13010         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13011
13012         # The following three properties are not in early releases
13013         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13014         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13015         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13016
13017         # For versions 4.1 - 5.0, there is no MidNumLet property, and
13018         # correspondingly the case-ignorable definition lacks that one.  For
13019         # 4.0, it appears that it was meant to be the same definition, but was
13020         # inadvertently omitted from the standard's text, so add it if the
13021         # property actually is there
13022         my $wb = property_ref('Word_Break');
13023         if (defined $wb) {
13024             my $midlet = $wb->table('MidLetter');
13025             $perl_case_ignorable += $midlet if defined $midlet;
13026             my $midnumlet = $wb->table('MidNumLet');
13027             $perl_case_ignorable += $midnumlet if defined $midnumlet;
13028         }
13029         else {
13030
13031             # In earlier versions of the standard, instead of the above two
13032             # properties , just the following characters were used:
13033             $perl_case_ignorable +=  0x0027  # APOSTROPHE
13034                                 +   0x00AD  # SOFT HYPHEN (SHY)
13035                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
13036         }
13037     }
13038
13039     # The remaining perl defined tables are mostly based on Unicode TR 18,
13040     # "Annex C: Compatibility Properties".  All of these have two versions,
13041     # one whose name generally begins with Posix that is posix-compliant, and
13042     # one that matches Unicode characters beyond the Posix, ASCII range
13043
13044     my $Alpha = $perl->add_match_table('Alpha');
13045
13046     # Alphabetic was not present in early releases
13047     my $Alphabetic = property_ref('Alphabetic');
13048     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13049         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13050     }
13051     else {
13052
13053         # The Alphabetic property doesn't exist for early releases, so
13054         # generate it.  The actual definition, in 5.2 terms is:
13055         #
13056         # gc=L + gc=Nl + Other_Alphabetic
13057         #
13058         # Other_Alphabetic is also not defined in these early releases, but it
13059         # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13060         # those last two as well, then subtract the relatively few of them that
13061         # shouldn't have been added.  (The gc=So range is the circled capital
13062         # Latin characters.  Early releases mistakenly didn't also include the
13063         # lower-case versions of these characters, and so we don't either, to
13064         # maintain consistency with those releases that first had this
13065         # property.
13066         $Alpha->initialize($gc->table('Letter')
13067                            + pre_3_dot_1_Nl()
13068                            + $gc->table('Mn')
13069                            + $gc->table('Mc')
13070                         );
13071         $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
13072         foreach my $range (     [ 0x0300, 0x0344 ],
13073                                 [ 0x0346, 0x034E ],
13074                                 [ 0x0360, 0x0362 ],
13075                                 [ 0x0483, 0x0486 ],
13076                                 [ 0x0591, 0x05AF ],
13077                                 [ 0x06DF, 0x06E0 ],
13078                                 [ 0x06EA, 0x06EC ],
13079                                 [ 0x0740, 0x074A ],
13080                                 0x093C,
13081                                 0x094D,
13082                                 [ 0x0951, 0x0954 ],
13083                                 0x09BC,
13084                                 0x09CD,
13085                                 0x0A3C,
13086                                 0x0A4D,
13087                                 0x0ABC,
13088                                 0x0ACD,
13089                                 0x0B3C,
13090                                 0x0B4D,
13091                                 0x0BCD,
13092                                 0x0C4D,
13093                                 0x0CCD,
13094                                 0x0D4D,
13095                                 0x0DCA,
13096                                 [ 0x0E47, 0x0E4C ],
13097                                 0x0E4E,
13098                                 [ 0x0EC8, 0x0ECC ],
13099                                 [ 0x0F18, 0x0F19 ],
13100                                 0x0F35,
13101                                 0x0F37,
13102                                 0x0F39,
13103                                 [ 0x0F3E, 0x0F3F ],
13104                                 [ 0x0F82, 0x0F84 ],
13105                                 [ 0x0F86, 0x0F87 ],
13106                                 0x0FC6,
13107                                 0x1037,
13108                                 0x1039,
13109                                 [ 0x17C9, 0x17D3 ],
13110                                 [ 0x20D0, 0x20DC ],
13111                                 0x20E1,
13112                                 [ 0x302A, 0x302F ],
13113                                 [ 0x3099, 0x309A ],
13114                                 [ 0xFE20, 0xFE23 ],
13115                                 [ 0x1D165, 0x1D169 ],
13116                                 [ 0x1D16D, 0x1D172 ],
13117                                 [ 0x1D17B, 0x1D182 ],
13118                                 [ 0x1D185, 0x1D18B ],
13119                                 [ 0x1D1AA, 0x1D1AD ],
13120         ) {
13121             if (ref $range) {
13122                 $Alpha->delete_range($range->[0], $range->[1]);
13123             }
13124             else {
13125                 $Alpha->delete_range($range, $range);
13126             }
13127         }
13128         $Alpha->add_description('Alphabetic');
13129         $Alpha->add_alias('Alphabetic');
13130     }
13131     $Alpha->add_alias('XPosixAlpha');
13132     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13133                             Description => "[A-Za-z]",
13134                             Initialize => $Alpha & $ASCII,
13135                             );
13136     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13137     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13138
13139     my $Alnum = $perl->add_match_table('Alnum',
13140                         Description => 'Alphabetic and (decimal) Numeric',
13141                         Initialize => $Alpha + $gc->table('Decimal_Number'),
13142                         );
13143     $Alnum->add_alias('XPosixAlnum');
13144     $perl->add_match_table("PosixAlnum",
13145                             Description => "[A-Za-z0-9]",
13146                             Initialize => $Alnum & $ASCII,
13147                             );
13148
13149     my $Word = $perl->add_match_table('Word',
13150                                 Description => '\w, including beyond ASCII;'
13151                                             . ' = \p{Alnum} + \pM + \p{Pc}',
13152                                 Initialize => $Alnum + $gc->table('Mark'),
13153                                 );
13154     $Word->add_alias('XPosixWord');
13155     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13156     if (defined $Pc) {
13157         $Word += $Pc;
13158     }
13159     else {
13160         $Word += ord('_');  # Make sure this is a $Word
13161     }
13162     my $JC = property_ref('Join_Control');  # Wasn't in release 1
13163     if (defined $JC) {
13164         $Word += $JC->table('Y');
13165     }
13166     else {
13167         $Word += 0x200C + 0x200D;
13168     }
13169
13170     # This is a Perl extension, so the name doesn't begin with Posix.
13171     my $PerlWord = $perl->add_match_table('PerlWord',
13172                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13173                     Initialize => $Word & $ASCII,
13174                     );
13175     $PerlWord->add_alias('PosixWord');
13176
13177     my $Blank = $perl->add_match_table('Blank',
13178                                 Description => '\h, Horizontal white space',
13179
13180                                 # 200B is Zero Width Space which is for line
13181                                 # break control, and was listed as
13182                                 # Space_Separator in early releases
13183                                 Initialize => $gc->table('Space_Separator')
13184                                             +   0x0009  # TAB
13185                                             -   0x200B, # ZWSP
13186                                 );
13187     $Blank->add_alias('HorizSpace');        # Another name for it.
13188     $Blank->add_alias('XPosixBlank');
13189     $perl->add_match_table("PosixBlank",
13190                             Description => "\\t and ' '",
13191                             Initialize => $Blank & $ASCII,
13192                             );
13193
13194     my $VertSpace = $perl->add_match_table('VertSpace',
13195                             Description => '\v',
13196                             Initialize => $gc->table('Line_Separator')
13197                                         + $gc->table('Paragraph_Separator')
13198                                         + 0x000A  # LINE FEED
13199                                         + 0x000B  # VERTICAL TAB
13200                                         + 0x000C  # FORM FEED
13201                                         + 0x000D  # CARRIAGE RETURN
13202                                         + 0x0085, # NEL
13203                             );
13204     # No Posix equivalent for vertical space
13205
13206     my $Space = $perl->add_match_table('Space',
13207                 Description => '\s including beyond ASCII plus vertical tab',
13208                 Initialize => $Blank + $VertSpace,
13209     );
13210     $Space->add_alias('XPosixSpace');
13211     $perl->add_match_table("PosixSpace",
13212                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
13213                             Initialize => $Space & $ASCII,
13214                             );
13215
13216     # Perl's traditional space doesn't include Vertical Tab
13217     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13218                                   Description => '\s, including beyond ASCII',
13219                                   #Initialize => $Space - 0x000B,
13220                                   Initialize => $Space,
13221                                 );
13222     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
13223     my $PerlSpace = $perl->add_match_table('PerlSpace',
13224                         Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13225                         Initialize => $XPerlSpace & $ASCII,
13226                             );
13227
13228
13229     my $Cntrl = $perl->add_match_table('Cntrl',
13230                                         Description => 'Control characters');
13231     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13232     $Cntrl->add_alias('XPosixCntrl');
13233     $perl->add_match_table("PosixCntrl",
13234                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13235                             Initialize => $Cntrl & $ASCII,
13236                             );
13237
13238     # $controls is a temporary used to construct Graph.
13239     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13240                                                 + $gc->table('Control'));
13241     # Cs not in release 1
13242     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13243
13244     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
13245     my $Graph = $perl->add_match_table('Graph',
13246                         Description => 'Characters that are graphical',
13247                         Initialize => ~ ($Space + $controls),
13248                         );
13249     $Graph->add_alias('XPosixGraph');
13250     $perl->add_match_table("PosixGraph",
13251                             Description =>
13252                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
13253                             Initialize => $Graph & $ASCII,
13254                             );
13255
13256     $print = $perl->add_match_table('Print',
13257                         Description => 'Characters that are graphical plus space characters (but no controls)',
13258                         Initialize => $Blank + $Graph - $gc->table('Control'),
13259                         );
13260     $print->add_alias('XPosixPrint');
13261     $perl->add_match_table("PosixPrint",
13262                             Description =>
13263                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
13264                             Initialize => $print & $ASCII,
13265                             );
13266
13267     my $Punct = $perl->add_match_table('Punct');
13268     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13269
13270     # \p{punct} doesn't include the symbols, which posix does
13271     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13272                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
13273                     Initialize => $gc->table('Punctuation')
13274                                 + ($ASCII & $gc->table('Symbol')),
13275                                 Perl_Extension => 1
13276         );
13277     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13278         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
13279         Initialize => $ASCII & $XPosixPunct,
13280         );
13281
13282     my $Digit = $perl->add_match_table('Digit',
13283                             Description => '[0-9] + all other decimal digits');
13284     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13285     $Digit->add_alias('XPosixDigit');
13286     my $PosixDigit = $perl->add_match_table("PosixDigit",
13287                                             Description => '[0-9]',
13288                                             Initialize => $Digit & $ASCII,
13289                                             );
13290
13291     # Hex_Digit was not present in first release
13292     my $Xdigit = $perl->add_match_table('XDigit');
13293     $Xdigit->add_alias('XPosixXDigit');
13294     my $Hex = property_ref('Hex_Digit');
13295     if (defined $Hex && ! $Hex->is_empty) {
13296         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13297     }
13298     else {
13299         # (Have to use hex instead of e.g. '0', because could be running on an
13300         # non-ASCII machine, and we want the Unicode (ASCII) values)
13301         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
13302                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13303         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13304     }
13305
13306     # AHex was not present in early releases
13307     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13308     my $AHex = property_ref('ASCII_Hex_Digit');
13309     if (defined $AHex && ! $AHex->is_empty) {
13310         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13311     }
13312     else {
13313         $PosixXDigit->initialize($Xdigit & $ASCII);
13314         $PosixXDigit->add_alias('AHex');
13315         $PosixXDigit->add_alias('Ascii_Hex_Digit');
13316     }
13317     $PosixXDigit->add_description('[0-9A-Fa-f]');
13318
13319     my $dt = property_ref('Decomposition_Type');
13320     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13321         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13322         Perl_Extension => 1,
13323         Note => 'Union of all non-canonical decompositions',
13324         );
13325
13326     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13327     # than SD appeared, construct it ourselves, based on the first release SD
13328     # was in.  A pod entry is grandfathered in for it
13329     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13330                                            Perl_Extension => 1,
13331                                            Fate => $INTERNAL_ONLY,
13332                                            Status => $DISCOURAGED);
13333     my $soft_dotted = property_ref('Soft_Dotted');
13334     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13335         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13336     }
13337     else {
13338
13339         # This list came from 3.2 Soft_Dotted; all of these code points are in
13340         # all releases
13341         $CanonDCIJ->initialize([ 0x0069,
13342                                  0x006A,
13343                                  0x012F,
13344                                  0x0268,
13345                                  0x0456,
13346                                  0x0458,
13347                                  0x1E2D,
13348                                  0x1ECB,
13349                                ]);
13350         $CanonDCIJ = $CanonDCIJ & $Assigned;
13351     }
13352
13353     # For backward compatibility, Perl has its own definition for IDStart.
13354     # It is regular XID_Start plus the underscore, but all characters must be
13355     # Word characters as well
13356     my $XID_Start = property_ref('XID_Start');
13357     my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13358                                             Perl_Extension => 1,
13359                                             Fate => $INTERNAL_ONLY,
13360                                             Initialize => ord('_')
13361                                             );
13362     if (defined $XID_Start
13363         || defined ($XID_Start = property_ref('ID_Start')))
13364     {
13365         $perl_xids += $XID_Start->table('Y');
13366     }
13367     else {
13368         # For Unicode versions that don't have the property, construct our own
13369         # from first principles.  The actual definition is:
13370         #     Letters
13371         #   + letter numbers (Nl)
13372         #   - Pattern_Syntax
13373         #   - Pattern_White_Space
13374         #   + stability extensions
13375         #   - NKFC modifications
13376         #
13377         # What we do in the code below is to include the identical code points
13378         # that are in the first release that had Unicode's version of this
13379         # property, essentially extrapolating backwards.  There were no
13380         # stability extensions until v4.1, so none are included; likewise in
13381         # no Unicode version so far do subtracting PatSyn and PatWS make any
13382         # difference, so those also are ignored.
13383         $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
13384
13385         # We do subtract the NFKC modifications that are in the first version
13386         # that had this property.  We don't bother to test if they are in the
13387         # version in question, because if they aren't, the operation is a
13388         # no-op.  The NKFC modifications are discussed in
13389         # http://www.unicode.org/reports/tr31/#NFKC_Modifications
13390         foreach my $range ( 0x037A,
13391                             0x0E33,
13392                             0x0EB3,
13393                             [ 0xFC5E, 0xFC63 ],
13394                             [ 0xFDFA, 0xFE70 ],
13395                             [ 0xFE72, 0xFE76 ],
13396                             0xFE78,
13397                             0xFE7A,
13398                             0xFE7C,
13399                             0xFE7E,
13400                             [ 0xFF9E, 0xFF9F ],
13401         ) {
13402             if (ref $range) {
13403                 $perl_xids->delete_range($range->[0], $range->[1]);
13404             }
13405             else {
13406                 $perl_xids->delete_range($range, $range);
13407             }
13408         }
13409     }
13410
13411     $perl_xids &= $Word;
13412
13413     my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
13414                                         Perl_Extension => 1,
13415                                         Fate => $INTERNAL_ONLY);
13416     my $XIDC = property_ref('XID_Continue');
13417     if (defined $XIDC
13418         || defined ($XIDC = property_ref('ID_Continue')))
13419     {
13420         $perl_xidc += $XIDC->table('Y');
13421     }
13422     else {
13423         # Similarly, we construct our own XIDC if necessary for early Unicode
13424         # versions.  The definition is:
13425         #     everything in XIDS
13426         #   + Gc=Mn
13427         #   + Gc=Mc
13428         #   + Gc=Nd
13429         #   + Gc=Pc
13430         #   - Pattern_Syntax
13431         #   - Pattern_White_Space
13432         #   + stability extensions
13433         #   - NFKC modifications
13434         #
13435         # The same thing applies to this as with XIDS for the PatSyn, PatWS,
13436         # and stability extensions.  There is a somewhat different set of NFKC
13437         # mods to remove (and add in this case).  The ones below make this
13438         # have identical code points as in the first release that defined it.
13439         $perl_xidc += $perl_xids
13440                     + $gc->table('L')
13441                     + $gc->table('Mn')
13442                     + $gc->table('Mc')
13443                     + $gc->table('Nd')
13444                     + 0x00B7
13445                     ;
13446         if (defined (my $pc = $gc->table('Pc'))) {
13447             $perl_xidc += $pc;
13448         }
13449         else {  # 1.1.5 didn't have Pc, but these should have been in it
13450             $perl_xidc += 0xFF3F;
13451             $perl_xidc->add_range(0x203F, 0x2040);
13452             $perl_xidc->add_range(0xFE33, 0xFE34);
13453             $perl_xidc->add_range(0xFE4D, 0xFE4F);
13454         }
13455
13456         # Subtract the NFKC mods
13457         foreach my $range ( 0x037A,
13458                             [ 0xFC5E, 0xFC63 ],
13459                             [ 0xFDFA, 0xFE1F ],
13460                             0xFE70,
13461                             [ 0xFE72, 0xFE76 ],
13462                             0xFE78,
13463                             0xFE7A,
13464                             0xFE7C,
13465                             0xFE7E,
13466         ) {
13467             if (ref $range) {
13468                 $perl_xidc->delete_range($range->[0], $range->[1]);
13469             }
13470             else {
13471                 $perl_xidc->delete_range($range, $range);
13472             }
13473         }
13474     }
13475
13476     $perl_xidc &= $Word;
13477
13478     # These two tables are for the 'extended' grapheme cluster, which came in
13479     # 5.1; create empty ones if not already present.  The non-extended
13480     # definition differs from the extended (see
13481     # http://www.unicode.org/reports/tr29/) only by these two tables, so we
13482     # get the older definition automatically when they are empty.
13483     my $gcb = property_ref('Grapheme_Cluster_Break');
13484     my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
13485                                         Perl_Extension => 1,
13486                                         Fate => $INTERNAL_ONLY);
13487     if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
13488         $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
13489     }
13490     else {
13491         push @tables_that_may_be_empty, $perl_prepend->complete_name;
13492     }
13493
13494
13495     # These are used in Unicode's definition of \X
13496     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
13497                                        Fate => $INTERNAL_ONLY);
13498     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
13499                                         Fate => $INTERNAL_ONLY);
13500
13501     # In the line below, two negatives means: yes hangul
13502     $begin += ~ property_ref('Hangul_Syllable_Type')
13503                                                 ->table('Not_Applicable')
13504             + ~ ($gcb->table('Control')
13505                 + $gcb->table('CR')
13506                 + $gcb->table('LF'));
13507     $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
13508
13509     $extend += $gcb->table('Extend');
13510     if (defined (my $sm = $gcb->table('SpacingMark'))) {
13511         $extend += $sm;
13512     }
13513     $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
13514
13515     # More GCB.  Populate a combined hangul syllables table
13516     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
13517                                           Perl_Extension => 1,
13518                                           Fate => $INTERNAL_ONLY);
13519     foreach my $gcb_name (qw{ L V T LV LVT }) {
13520
13521         # The perl internal extension's name is the gcb table name prepended
13522         # with an '_X_'
13523         my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
13524                                         Perl_Extension => 1,
13525                                         Fate => $INTERNAL_ONLY,
13526                                         Initialize => $gcb->table($gcb_name),
13527                                         );
13528         # Version 1 had mostly different Hangul syllables that were removed
13529         # from later versions, so some of the tables may not apply.
13530         if ($v_version lt v2.0) {
13531             push @tables_that_may_be_empty, $perl_table->complete_name;
13532         }
13533     }
13534     $perl->add_match_table('_X_HST_Not_Applicable',
13535                             Perl_Extension => 1,
13536                             Fate => $INTERNAL_ONLY,
13537                             Initialize => property_ref('HST')->table('NA'),
13538                             );
13539     $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
13540     $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
13541
13542     my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
13543
13544     if (@named_sequences) {
13545         push @composition, 'Named_Sequence';
13546         foreach my $sequence (@named_sequences) {
13547             $perl_charname->add_anomalous_entry($sequence);
13548         }
13549     }
13550
13551     my $alias_sentence = "";
13552     my %abbreviations;
13553     my $alias = property_ref('Name_Alias');
13554     $perl_charname->set_proxy_for('Name_Alias');
13555
13556     # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
13557     # respect to any existing entry depends on the entry type.  Corrections go
13558     # before said entry, as they should be returned in preference over the
13559     # existing entry.  (A correction to a correction should be later in the
13560     # Name_Alias table, so it will correctly precede the erroneous correction
13561     # in Perl_Charnames.)
13562     #
13563     # Abbreviations go after everything else, so they are saved temporarily in
13564     # a hash for later.
13565     #
13566     # Everything else is added added afterwards, which preserves the input
13567     # ordering
13568
13569     foreach my $range ($alias->ranges) {
13570         next if $range->value eq "";
13571         my $code_point = $range->start;
13572         if ($code_point != $range->end) {
13573             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
13574         }
13575         my ($value, $type) = split ': ', $range->value;
13576         my $replace_type;
13577         if ($type eq 'correction') {
13578             $replace_type = $MULTIPLE_BEFORE;
13579         }
13580         elsif ($type eq 'abbreviation') {
13581
13582             # Save for later
13583             $abbreviations{$value} = $code_point;
13584             next;
13585         }
13586         else {
13587             $replace_type = $MULTIPLE_AFTER;
13588         }
13589
13590         # Actually add; before or after current entry(ies) as determined
13591         # above.
13592
13593         $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
13594     }
13595     $alias_sentence = <<END;
13596 The Name_Alias property adds duplicate code point entries that are
13597 alternatives to the original name.  If an addition is a corrected
13598 name, it will be physically first in the table.  The original (less correct,
13599 but still valid) name will be next; then any alternatives, in no particular
13600 order; and finally any abbreviations, again in no particular order.
13601 END
13602
13603     # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
13604     # precedence before 6.1, so should be first in the file; the other names
13605     # have precedence starting in 6.1,
13606     my $before_or_after = ($v_version lt v6.1.0)
13607                           ? $MULTIPLE_BEFORE
13608                           : $MULTIPLE_AFTER;
13609
13610     foreach my $range (property_ref('Unicode_1_Name')->ranges) {
13611         my $code_point = $range->start;
13612         my $unicode_1_value = $range->value;
13613         next if $unicode_1_value eq "";     # Skip if name doesn't exist.
13614
13615         if ($code_point != $range->end) {
13616             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
13617         }
13618
13619         # To handle EBCDIC, we don't hard code in the code points of the
13620         # controls; instead realizing that all of them are below 256.
13621         last if $code_point > 255;
13622
13623         # We only add in the controls.
13624         next if $gc->value_of($code_point) ne 'Cc';
13625
13626         # We reject this Unicode1 name for later Perls, as it is used for
13627         # another code point
13628         next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
13629
13630         # This won't add an exact duplicate.
13631         $perl_charname->add_duplicate($code_point, $unicode_1_value,
13632                                         Replace => $before_or_after);
13633     }
13634
13635     # But in this version only, the ALERT has precedence over BELL, the
13636     # Unicode_1_Name that would otherwise have precedence.
13637     if ($v_version eq v6.0.0) {
13638         $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
13639     }
13640
13641     # Now that have everything added, add in abbreviations after
13642     # everything else.
13643     foreach my $value (keys %abbreviations) {
13644         $perl_charname->add_duplicate($abbreviations{$value}, $value,
13645                                         Replace => $MULTIPLE_AFTER);
13646     }
13647
13648     my $comment;
13649     if (@composition <= 2) { # Always at least 2
13650         $comment = join " and ", @composition;
13651     }
13652     else {
13653         $comment = join ", ", @composition[0 .. scalar @composition - 2];
13654         $comment .= ", and $composition[-1]";
13655     }
13656
13657     $perl_charname->add_comment(join_lines( <<END
13658 This file is for charnames.pm.  It is the union of the $comment properties.
13659 Unicode_1_Name entries are used only for nameless code points in the Name
13660 property.
13661 $alias_sentence
13662 This file doesn't include the algorithmically determinable names.  For those,
13663 use 'unicore/Name.pm'
13664 END
13665     ));
13666     property_ref('Name')->add_comment(join_lines( <<END
13667 This file doesn't include the algorithmically determinable names.  For those,
13668 use 'unicore/Name.pm'
13669 END
13670     ));
13671
13672     # Construct the Present_In property from the Age property.
13673     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
13674         my $default_map = $age->default_map;
13675         my $in = Property->new('In',
13676                                 Default_Map => $default_map,
13677                                 Full_Name => "Present_In",
13678                                 Perl_Extension => 1,
13679                                 Type => $ENUM,
13680                                 Initialize => $age,
13681                                 );
13682         $in->add_comment(join_lines(<<END
13683 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
13684 same as for $age, and not for what $in really means.  This is because anything
13685 defined in a given release should have multiple values: that release and all
13686 higher ones.  But only one value per code point can be represented in a table
13687 like this.
13688 END
13689         ));
13690
13691         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
13692         # lowest numbered (earliest) come first, with the non-numeric one
13693         # last.
13694         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
13695                                             ? 1
13696                                             : ($b->name !~ /^[\d.]*$/)
13697                                                 ? -1
13698                                                 : $a->name <=> $b->name
13699                                             } $age->tables;
13700
13701         # The Present_In property is the cumulative age properties.  The first
13702         # one hence is identical to the first age one.
13703         my $previous_in = $in->add_match_table($first_age->name);
13704         $previous_in->set_equivalent_to($first_age, Related => 1);
13705
13706         my $description_start = "Code point's usage introduced in version ";
13707         $first_age->add_description($description_start . $first_age->name);
13708
13709         # To construct the accumulated values, for each of the age tables
13710         # starting with the 2nd earliest, merge the earliest with it, to get
13711         # all those code points existing in the 2nd earliest.  Repeat merging
13712         # the new 2nd earliest with the 3rd earliest to get all those existing
13713         # in the 3rd earliest, and so on.
13714         foreach my $current_age (@rest_ages) {
13715             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
13716
13717             my $current_in = $in->add_match_table(
13718                                     $current_age->name,
13719                                     Initialize => $current_age + $previous_in,
13720                                     Description => $description_start
13721                                                     . $current_age->name
13722                                                     . ' or earlier',
13723                                     );
13724             $previous_in = $current_in;
13725
13726             # Add clarifying material for the corresponding age file.  This is
13727             # in part because of the confusing and contradictory information
13728             # given in the Standard's documentation itself, as of 5.2.
13729             $current_age->add_description(
13730                             "Code point's usage was introduced in version "
13731                             . $current_age->name);
13732             $current_age->add_note("See also $in");
13733
13734         }
13735
13736         # And finally the code points whose usages have yet to be decided are
13737         # the same in both properties.  Note that permanently unassigned code
13738         # points actually have their usage assigned (as being permanently
13739         # unassigned), so that these tables are not the same as gc=cn.
13740         my $unassigned = $in->add_match_table($default_map);
13741         my $age_default = $age->table($default_map);
13742         $age_default->add_description(<<END
13743 Code point's usage has not been assigned in any Unicode release thus far.
13744 END
13745         );
13746         $unassigned->set_equivalent_to($age_default, Related => 1);
13747     }
13748
13749     # See L<perlfunc/quotemeta>
13750     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
13751                                            Perl_Extension => 1,
13752                                            Fate => $INTERNAL_ONLY,
13753
13754                                            # Initialize to what's common in
13755                                            # all Unicode releases.
13756                                            Initialize =>
13757                                                 $Space
13758                                                 + $gc->table('Control')
13759                            );
13760
13761     # In early releases without the proper Unicode properties, just set to \W.
13762     if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
13763         || ! defined (my $patws = property_ref('Pattern_White_Space'))
13764         || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
13765     {
13766         $quotemeta += ~ $Word;
13767     }
13768     else {
13769         $quotemeta += $patsyn->table('Y')
13770                    + $patws->table('Y')
13771                    + $di->table('Y')
13772                    + ((~ $Word) & $ASCII);
13773     }
13774
13775     # Finished creating all the perl properties.  All non-internal non-string
13776     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
13777     # an underscore.)  These do not get a separate entry in the pod file
13778     foreach my $table ($perl->tables) {
13779         foreach my $alias ($table->aliases) {
13780             next if $alias->name =~ /^_/;
13781             $table->add_alias('Is_' . $alias->name,
13782                                Re_Pod_Entry => 0,
13783                                UCD => 0,
13784                                Status => $alias->status,
13785                                OK_as_Filename => 0);
13786         }
13787     }
13788
13789     # Here done with all the basic stuff.  Ready to populate the information
13790     # about each character if annotating them.
13791     if ($annotate) {
13792
13793         # See comments at its declaration
13794         $annotate_ranges = Range_Map->new;
13795
13796         # This separates out the non-characters from the other unassigneds, so
13797         # can give different annotations for each.
13798         $unassigned_sans_noncharacters = Range_List->new(
13799                                     Initialize => $gc->table('Unassigned'));
13800         if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
13801             $unassigned_sans_noncharacters &= $nonchars->table('N');
13802         }
13803
13804         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
13805             $i = populate_char_info($i);    # Note sets $i so may cause skips
13806         }
13807     }
13808
13809     return;
13810 }
13811
13812 sub add_perl_synonyms() {
13813     # A number of Unicode tables have Perl synonyms that are expressed in
13814     # the single-form, \p{name}.  These are:
13815     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
13816     #       \p{Is_Name} as synonyms
13817     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
13818     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
13819     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
13820     #       conflict, \p{Value} and \p{Is_Value} as well
13821     #
13822     # This routine generates these synonyms, warning of any unexpected
13823     # conflicts.
13824
13825     # Construct the list of tables to get synonyms for.  Start with all the
13826     # binary and the General_Category ones.
13827     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
13828                                                             property_ref('*');
13829     push @tables, $gc->tables;
13830
13831     # If the version of Unicode includes the Script property, add its tables
13832     push @tables, $script->tables if defined $script;
13833
13834     # The Block tables are kept separate because they are treated differently.
13835     # And the earliest versions of Unicode didn't include them, so add only if
13836     # there are some.
13837     my @blocks;
13838     push @blocks, $block->tables if defined $block;
13839
13840     # Here, have the lists of tables constructed.  Process blocks last so that
13841     # if there are name collisions with them, blocks have lowest priority.
13842     # Should there ever be other collisions, manual intervention would be
13843     # required.  See the comments at the beginning of the program for a
13844     # possible way to handle those semi-automatically.
13845     foreach my $table (@tables,  @blocks) {
13846
13847         # For non-binary properties, the synonym is just the name of the
13848         # table, like Greek, but for binary properties the synonym is the name
13849         # of the property, and means the code points in its 'Y' table.
13850         my $nominal = $table;
13851         my $nominal_property = $nominal->property;
13852         my $actual;
13853         if (! $nominal->isa('Property')) {
13854             $actual = $table;
13855         }
13856         else {
13857
13858             # Here is a binary property.  Use the 'Y' table.  Verify that is
13859             # there
13860             my $yes = $nominal->table('Y');
13861             unless (defined $yes) {  # Must be defined, but is permissible to
13862                                      # be empty.
13863                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
13864                 next;
13865             }
13866             $actual = $yes;
13867         }
13868
13869         foreach my $alias ($nominal->aliases) {
13870
13871             # Attempt to create a table in the perl directory for the
13872             # candidate table, using whatever aliases in it that don't
13873             # conflict.  Also add non-conflicting aliases for all these
13874             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
13875             PREFIX:
13876             foreach my $prefix ("", 'Is_', 'In_') {
13877
13878                 # Only Block properties can have added 'In_' aliases.
13879                 next if $prefix eq 'In_' and $nominal_property != $block;
13880
13881                 my $proposed_name = $prefix . $alias->name;
13882
13883                 # No Is_Is, In_In, nor combinations thereof
13884                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
13885                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
13886
13887                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
13888
13889                 # Get a reference to any existing table in the perl
13890                 # directory with the desired name.
13891                 my $pre_existing = $perl->table($proposed_name);
13892
13893                 if (! defined $pre_existing) {
13894
13895                     # No name collision, so ok to add the perl synonym.
13896
13897                     my $make_re_pod_entry;
13898                     my $ok_as_filename;
13899                     my $status = $alias->status;
13900                     if ($nominal_property == $block) {
13901
13902                         # For block properties, the 'In' form is preferred for
13903                         # external use; the pod file contains wild cards for
13904                         # this and the 'Is' form so no entries for those; and
13905                         # we don't want people using the name without the
13906                         # 'In', so discourage that.
13907                         if ($prefix eq "") {
13908                             $make_re_pod_entry = 1;
13909                             $status = $status || $DISCOURAGED;
13910                             $ok_as_filename = 0;
13911                         }
13912                         elsif ($prefix eq 'In_') {
13913                             $make_re_pod_entry = 0;
13914                             $status = $status || $NORMAL;
13915                             $ok_as_filename = 1;
13916                         }
13917                         else {
13918                             $make_re_pod_entry = 0;
13919                             $status = $status || $DISCOURAGED;
13920                             $ok_as_filename = 0;
13921                         }
13922                     }
13923                     elsif ($prefix ne "") {
13924
13925                         # The 'Is' prefix is handled in the pod by a wild
13926                         # card, and we won't use it for an external name
13927                         $make_re_pod_entry = 0;
13928                         $status = $status || $NORMAL;
13929                         $ok_as_filename = 0;
13930                     }
13931                     else {
13932
13933                         # Here, is an empty prefix, non block.  This gets its
13934                         # own pod entry and can be used for an external name.
13935                         $make_re_pod_entry = 1;
13936                         $status = $status || $NORMAL;
13937                         $ok_as_filename = 1;
13938                     }
13939
13940                     # Here, there isn't a perl pre-existing table with the
13941                     # name.  Look through the list of equivalents of this
13942                     # table to see if one is a perl table.
13943                     foreach my $equivalent ($actual->leader->equivalents) {
13944                         next if $equivalent->property != $perl;
13945
13946                         # Here, have found a table for $perl.  Add this alias
13947                         # to it, and are done with this prefix.
13948                         $equivalent->add_alias($proposed_name,
13949                                         Re_Pod_Entry => $make_re_pod_entry,
13950
13951                                         # Currently don't output these in the
13952                                         # ucd pod, as are strongly discouraged
13953                                         # from being used
13954                                         UCD => 0,
13955
13956                                         Status => $status,
13957                                         OK_as_Filename => $ok_as_filename);
13958                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
13959                         next PREFIX;
13960                     }
13961
13962                     # Here, $perl doesn't already have a table that is a
13963                     # synonym for this property, add one.
13964                     my $added_table = $perl->add_match_table($proposed_name,
13965                                             Re_Pod_Entry => $make_re_pod_entry,
13966
13967                                             # See UCD comment just above
13968                                             UCD => 0,
13969
13970                                             Status => $status,
13971                                             OK_as_Filename => $ok_as_filename);
13972                     # And it will be related to the actual table, since it is
13973                     # based on it.
13974                     $added_table->set_equivalent_to($actual, Related => 1);
13975                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
13976                     next;
13977                 } # End of no pre-existing.
13978
13979                 # Here, there is a pre-existing table that has the proposed
13980                 # name.  We could be in trouble, but not if this is just a
13981                 # synonym for another table that we have already made a child
13982                 # of the pre-existing one.
13983                 if ($pre_existing->is_set_equivalent_to($actual)) {
13984                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
13985                     $pre_existing->add_alias($proposed_name);
13986                     next;
13987                 }
13988
13989                 # Here, there is a name collision, but it still could be ok if
13990                 # the tables match the identical set of code points, in which
13991                 # case, we can combine the names.  Compare each table's code
13992                 # point list to see if they are identical.
13993                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
13994                 if ($pre_existing->matches_identically_to($actual)) {
13995
13996                     # Here, they do match identically.  Not a real conflict.
13997                     # Make the perl version a child of the Unicode one, except
13998                     # in the non-obvious case of where the perl name is
13999                     # already a synonym of another Unicode property.  (This is
14000                     # excluded by the test for it being its own parent.)  The
14001                     # reason for this exclusion is that then the two Unicode
14002                     # properties become related; and we don't really know if
14003                     # they are or not.  We generate documentation based on
14004                     # relatedness, and this would be misleading.  Code
14005                     # later executed in the process will cause the tables to
14006                     # be represented by a single file anyway, without making
14007                     # it look in the pod like they are necessarily related.
14008                     if ($pre_existing->parent == $pre_existing
14009                         && ($pre_existing->property == $perl
14010                             || $actual->property == $perl))
14011                     {
14012                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14013                         $pre_existing->set_equivalent_to($actual, Related => 1);
14014                     }
14015                     elsif (main::DEBUG && $to_trace) {
14016                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14017                         trace $pre_existing->parent;
14018                     }
14019                     next PREFIX;
14020                 }
14021
14022                 # Here they didn't match identically, there is a real conflict
14023                 # between our new name and a pre-existing property.
14024                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14025                 $pre_existing->add_conflicting($nominal->full_name,
14026                                                'p',
14027                                                $actual);
14028
14029                 # Don't output a warning for aliases for the block
14030                 # properties (unless they start with 'In_') as it is
14031                 # expected that there will be conflicts and the block
14032                 # form loses.
14033                 if ($verbosity >= $NORMAL_VERBOSITY
14034                     && ($actual->property != $block || $prefix eq 'In_'))
14035                 {
14036                     print simple_fold(join_lines(<<END
14037 There is already an alias named $proposed_name (from $pre_existing),
14038 so not creating this alias for $actual
14039 END
14040                     ), "", 4);
14041                 }
14042
14043                 # Keep track for documentation purposes.
14044                 $has_In_conflicts++ if $prefix eq 'In_';
14045                 $has_Is_conflicts++ if $prefix eq 'Is_';
14046             }
14047         }
14048     }
14049
14050     # There are some properties which have No and Yes (and N and Y) as
14051     # property values, but aren't binary, and could possibly be confused with
14052     # binary ones.  So create caveats for them.  There are tables that are
14053     # named 'No', and tables that are named 'N', but confusion is not likely
14054     # unless they are the same table.  For example, N meaning Number or
14055     # Neutral is not likely to cause confusion, so don't add caveats to things
14056     # like them.
14057     foreach my $property (grep { $_->type != $BINARY
14058                                  && $_->type != $FORCED_BINARY }
14059                                                             property_ref('*'))
14060     {
14061         my $yes = $property->table('Yes');
14062         if (defined $yes) {
14063             my $y = $property->table('Y');
14064             if (defined $y && $yes == $y) {
14065                 foreach my $alias ($property->aliases) {
14066                     $yes->add_conflicting($alias->name);
14067                 }
14068             }
14069         }
14070         my $no = $property->table('No');
14071         if (defined $no) {
14072             my $n = $property->table('N');
14073             if (defined $n && $no == $n) {
14074                 foreach my $alias ($property->aliases) {
14075                     $no->add_conflicting($alias->name, 'P');
14076                 }
14077             }
14078         }
14079     }
14080
14081     return;
14082 }
14083
14084 sub register_file_for_name($$$) {
14085     # Given info about a table and a datafile that it should be associated
14086     # with, register that association
14087
14088     my $table = shift;
14089     my $directory_ref = shift;   # Array of the directory path for the file
14090     my $file = shift;            # The file name in the final directory.
14091     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14092
14093     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14094
14095     if ($table->isa('Property')) {
14096         $table->set_file_path(@$directory_ref, $file);
14097         push @map_properties, $table;
14098
14099         # No swash means don't do the rest of this.
14100         return if $table->fate != $ORDINARY;
14101
14102         # Get the path to the file
14103         my @path = $table->file_path;
14104
14105         # Use just the file name if no subdirectory.
14106         shift @path if $path[0] eq File::Spec->curdir();
14107
14108         my $file = join '/', @path;
14109
14110         # Create a hash entry for utf8_heavy to get the file that stores this
14111         # property's map table
14112         foreach my $alias ($table->aliases) {
14113             my $name = $alias->name;
14114             $loose_property_to_file_of{standardize($name)} = $file;
14115         }
14116
14117         # And a way for utf8_heavy to find the proper key in the SwashInfo
14118         # hash for this property.
14119         $file_to_swash_name{$file} = "To" . $table->swash_name;
14120         return;
14121     }
14122
14123     # Do all of the work for all equivalent tables when called with the leader
14124     # table, so skip if isn't the leader.
14125     return if $table->leader != $table;
14126
14127     # If this is a complement of another file, use that other file instead,
14128     # with a ! prepended to it.
14129     my $complement;
14130     if (($complement = $table->complement) != 0) {
14131         my @directories = $complement->file_path;
14132
14133         # This assumes that the 0th element is something like 'lib',
14134         # the 1th element the property name (in its own directory), like
14135         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14136         # appended to it later.
14137         $directories[1] =~ s/^/!/;
14138         $file = pop @directories;
14139         $directory_ref =\@directories;
14140     }
14141
14142     # Join all the file path components together, using slashes.
14143     my $full_filename = join('/', @$directory_ref, $file);
14144
14145     # All go in the same subdirectory of unicore
14146     if ($directory_ref->[0] ne $matches_directory) {
14147         Carp::my_carp("Unexpected directory in "
14148                 .  join('/', @{$directory_ref}, $file));
14149     }
14150
14151     # For this table and all its equivalents ...
14152     foreach my $table ($table, $table->equivalents) {
14153
14154         # Associate it with its file internally.  Don't include the
14155         # $matches_directory first component
14156         $table->set_file_path(@$directory_ref, $file);
14157
14158         # No swash means don't do the rest of this.
14159         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14160
14161         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14162
14163         my $property = $table->property;
14164         my $property_name = ($property == $perl)
14165                              ? ""  # 'perl' is never explicitly stated
14166                              : standardize($property->name) . '=';
14167
14168         my $is_default = 0; # Is this table the default one for the property?
14169
14170         # To calculate $is_default, we find if this table is the same as the
14171         # default one for the property.  But this is complicated by the
14172         # possibility that there is a master table for this one, and the
14173         # information is stored there instead of here.
14174         my $parent = $table->parent;
14175         my $leader_prop = $parent->property;
14176         my $default_map = $leader_prop->default_map;
14177         if (defined $default_map) {
14178             my $default_table = $leader_prop->table($default_map);
14179             $is_default = 1 if defined $default_table && $parent == $default_table;
14180         }
14181
14182         # Calculate the loose name for this table.  Mostly it's just its name,
14183         # standardized.  But in the case of Perl tables that are single-form
14184         # equivalents to Unicode properties, it is the latter's name.
14185         my $loose_table_name =
14186                         ($property != $perl || $leader_prop == $perl)
14187                         ? standardize($table->name)
14188                         : standardize($parent->name);
14189
14190         my $deprecated = ($table->status eq $DEPRECATED)
14191                          ? $table->status_info
14192                          : "";
14193         my $caseless_equivalent = $table->caseless_equivalent;
14194
14195         # And for each of the table's aliases...  This inner loop eventually
14196         # goes through all aliases in the UCD that we generate regex match
14197         # files for
14198         foreach my $alias ($table->aliases) {
14199             my $standard = utf8_heavy_name($table, $alias);
14200
14201             # Generate an entry in either the loose or strict hashes, which
14202             # will translate the property and alias names combination into the
14203             # file where the table for them is stored.
14204             if ($alias->loose_match) {
14205                 if (exists $loose_to_file_of{$standard}) {
14206                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14207                 }
14208                 else {
14209                     $loose_to_file_of{$standard} = $sub_filename;
14210                 }
14211             }
14212             else {
14213                 if (exists $stricter_to_file_of{$standard}) {
14214                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14215                 }
14216                 else {
14217                     $stricter_to_file_of{$standard} = $sub_filename;
14218
14219                     # Tightly coupled with how utf8_heavy.pl works, for a
14220                     # floating point number that is a whole number, get rid of
14221                     # the trailing decimal point and 0's, so that utf8_heavy
14222                     # will work.  Also note that this assumes that such a
14223                     # number is matched strictly; so if that were to change,
14224                     # this would be wrong.
14225                     if ((my $integer_name = $alias->name)
14226                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14227                     {
14228                         $stricter_to_file_of{$property_name . $integer_name}
14229                                                             = $sub_filename;
14230                     }
14231                 }
14232             }
14233
14234             # For Unicode::UCD, create a mapping of the prop=value to the
14235             # canonical =value for that property.
14236             if ($standard =~ /=/) {
14237
14238                 # This could happen if a strict name mapped into an existing
14239                 # loose name.  In that event, the strict names would have to
14240                 # be moved to a new hash.
14241                 if (exists($loose_to_standard_value{$standard})) {
14242                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
14243                 }
14244                 $loose_to_standard_value{$standard} = $loose_table_name;
14245             }
14246
14247             # Keep a list of the deprecated properties and their filenames
14248             if ($deprecated && $complement == 0) {
14249                 $utf8::why_deprecated{$sub_filename} = $deprecated;
14250             }
14251
14252             # And a substitute table, if any, for case-insensitive matching
14253             if ($caseless_equivalent != 0) {
14254                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14255             }
14256
14257             # Add to defaults list if the table this alias belongs to is the
14258             # default one
14259             $loose_defaults{$standard} = 1 if $is_default;
14260         }
14261     }
14262
14263     return;
14264 }
14265
14266 {   # Closure
14267     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
14268                      # conflicts
14269     my %full_dir_name_of;   # Full length names of directories used.
14270
14271     sub construct_filename($$$) {
14272         # Return a file name for a table, based on the table name, but perhaps
14273         # changed to get rid of non-portable characters in it, and to make
14274         # sure that it is unique on a file system that allows the names before
14275         # any period to be at most 8 characters (DOS).  While we're at it
14276         # check and complain if there are any directory conflicts.
14277
14278         my $name = shift;       # The name to start with
14279         my $mutable = shift;    # Boolean: can it be changed?  If no, but
14280                                 # yet it must be to work properly, a warning
14281                                 # is given
14282         my $directories_ref = shift;  # A reference to an array containing the
14283                                 # path to the file, with each element one path
14284                                 # component.  This is used because the same
14285                                 # name can be used in different directories.
14286         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14287
14288         my $warn = ! defined wantarray;  # If true, then if the name is
14289                                 # changed, a warning is issued as well.
14290
14291         if (! defined $name) {
14292             Carp::my_carp("Undefined name in directory "
14293                           . File::Spec->join(@$directories_ref)
14294                           . ". '_' used");
14295             return '_';
14296         }
14297
14298         # Make sure that no directory names conflict with each other.  Look at
14299         # each directory in the input file's path.  If it is already in use,
14300         # assume it is correct, and is merely being re-used, but if we
14301         # truncate it to 8 characters, and find that there are two directories
14302         # that are the same for the first 8 characters, but differ after that,
14303         # then that is a problem.
14304         foreach my $directory (@$directories_ref) {
14305             my $short_dir = substr($directory, 0, 8);
14306             if (defined $full_dir_name_of{$short_dir}) {
14307                 next if $full_dir_name_of{$short_dir} eq $directory;
14308                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
14309             }
14310             else {
14311                 $full_dir_name_of{$short_dir} = $directory;
14312             }
14313         }
14314
14315         my $path = join '/', @$directories_ref;
14316         $path .= '/' if $path;
14317
14318         # Remove interior underscores.
14319         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14320
14321         # Change any non-word character into an underscore, and truncate to 8.
14322         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
14323         substr($filename, 8) = "" if length($filename) > 8;
14324
14325         # Make sure the basename doesn't conflict with something we
14326         # might have already written. If we have, say,
14327         #     InGreekExtended1
14328         #     InGreekExtended2
14329         # they become
14330         #     InGreekE
14331         #     InGreek2
14332         my $warned = 0;
14333         while (my $num = $base_names{$path}{lc $filename}++) {
14334             $num++; # so basenames with numbers start with '2', which
14335                     # just looks more natural.
14336
14337             # Want to append $num, but if it'll make the basename longer
14338             # than 8 characters, pre-truncate $filename so that the result
14339             # is acceptable.
14340             my $delta = length($filename) + length($num) - 8;
14341             if ($delta > 0) {
14342                 substr($filename, -$delta) = $num;
14343             }
14344             else {
14345                 $filename .= $num;
14346             }
14347             if ($warn && ! $warned) {
14348                 $warned = 1;
14349                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
14350             }
14351         }
14352
14353         return $filename if $mutable;
14354
14355         # If not changeable, must return the input name, but warn if needed to
14356         # change it beyond shortening it.
14357         if ($name ne $filename
14358             && substr($name, 0, length($filename)) ne $filename) {
14359             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
14360         }
14361         return $name;
14362     }
14363 }
14364
14365 # The pod file contains a very large table.  Many of the lines in that table
14366 # would exceed a typical output window's size, and so need to be wrapped with
14367 # a hanging indent to make them look good.  The pod language is really
14368 # insufficient here.  There is no general construct to do that in pod, so it
14369 # is done here by beginning each such line with a space to cause the result to
14370 # be output without formatting, and doing all the formatting here.  This leads
14371 # to the result that if the eventual display window is too narrow it won't
14372 # look good, and if the window is too wide, no advantage is taken of that
14373 # extra width.  A further complication is that the output may be indented by
14374 # the formatter so that there is less space than expected.  What I (khw) have
14375 # done is to assume that that indent is a particular number of spaces based on
14376 # what it is in my Linux system;  people can always resize their windows if
14377 # necessary, but this is obviously less than desirable, but the best that can
14378 # be expected.
14379 my $automatic_pod_indent = 8;
14380
14381 # Try to format so that uses fewest lines, but few long left column entries
14382 # slide into the right column.  An experiment on 5.1 data yielded the
14383 # following percentages that didn't cut into the other side along with the
14384 # associated first-column widths
14385 # 69% = 24
14386 # 80% not too bad except for a few blocks
14387 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
14388 # 95% = 37;
14389 my $indent_info_column = 27;    # 75% of lines didn't have overlap
14390
14391 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
14392                     # The 3 is because of:
14393                     #   1   for the leading space to tell the pod formatter to
14394                     #       output as-is
14395                     #   1   for the flag
14396                     #   1   for the space between the flag and the main data
14397
14398 sub format_pod_line ($$$;$$) {
14399     # Take a pod line and return it, formatted properly
14400
14401     my $first_column_width = shift;
14402     my $entry = shift;  # Contents of left column
14403     my $info = shift;   # Contents of right column
14404
14405     my $status = shift || "";   # Any flag
14406
14407     my $loose_match = shift;    # Boolean.
14408     $loose_match = 1 unless defined $loose_match;
14409
14410     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14411
14412     my $flags = "";
14413     $flags .= $STRICTER if ! $loose_match;
14414
14415     $flags .= $status if $status;
14416
14417     # There is a blank in the left column to cause the pod formatter to
14418     # output the line as-is.
14419     return sprintf " %-*s%-*s %s\n",
14420                     # The first * in the format is replaced by this, the -1 is
14421                     # to account for the leading blank.  There isn't a
14422                     # hard-coded blank after this to separate the flags from
14423                     # the rest of the line, so that in the unlikely event that
14424                     # multiple flags are shown on the same line, they both
14425                     # will get displayed at the expense of that separation,
14426                     # but since they are left justified, a blank will be
14427                     # inserted in the normal case.
14428                     $FILLER - 1,
14429                     $flags,
14430
14431                     # The other * in the format is replaced by this number to
14432                     # cause the first main column to right fill with blanks.
14433                     # The -1 is for the guaranteed blank following it.
14434                     $first_column_width - $FILLER - 1,
14435                     $entry,
14436                     $info;
14437 }
14438
14439 my @zero_match_tables;  # List of tables that have no matches in this release
14440
14441 sub make_re_pod_entries($) {
14442     # This generates the entries for the pod file for a given table.
14443     # Also done at this time are any children tables.  The output looks like:
14444     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
14445
14446     my $input_table = shift;        # Table the entry is for
14447     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14448
14449     # Generate parent and all its children at the same time.
14450     return if $input_table->parent != $input_table;
14451
14452     my $property = $input_table->property;
14453     my $type = $property->type;
14454     my $full_name = $property->full_name;
14455
14456     my $count = $input_table->count;
14457     my $string_count = clarify_number($count);
14458     my $status = $input_table->status;
14459     my $status_info = $input_table->status_info;
14460     my $caseless_equivalent = $input_table->caseless_equivalent;
14461
14462     # Don't mention a placeholder equivalent as it isn't to be listed in the
14463     # pod
14464     $caseless_equivalent = 0 if $caseless_equivalent != 0
14465                                 && $caseless_equivalent->fate > $ORDINARY;
14466
14467     my $entry_for_first_table; # The entry for the first table output.
14468                            # Almost certainly, it is the parent.
14469
14470     # For each related table (including itself), we will generate a pod entry
14471     # for each name each table goes by
14472     foreach my $table ($input_table, $input_table->children) {
14473
14474         # utf8_heavy.pl cannot deal with null string property values, so skip
14475         # any tables that have no non-null names.
14476         next if ! grep { $_->name ne "" } $table->aliases;
14477
14478         # First, gather all the info that applies to this table as a whole.
14479
14480         push @zero_match_tables, $table if $count == 0
14481                                             # Don't mention special tables
14482                                             # as being zero length
14483                                            && $table->fate == $ORDINARY;
14484
14485         my $table_property = $table->property;
14486
14487         # The short name has all the underscores removed, while the full name
14488         # retains them.  Later, we decide whether to output a short synonym
14489         # for the full one, we need to compare apples to apples, so we use the
14490         # short name's length including underscores.
14491         my $table_property_short_name_length;
14492         my $table_property_short_name
14493             = $table_property->short_name(\$table_property_short_name_length);
14494         my $table_property_full_name = $table_property->full_name;
14495
14496         # Get how much savings there is in the short name over the full one
14497         # (delta will always be <= 0)
14498         my $table_property_short_delta = $table_property_short_name_length
14499                                          - length($table_property_full_name);
14500         my @table_description = $table->description;
14501         my @table_note = $table->note;
14502
14503         # Generate an entry for each alias in this table.
14504         my $entry_for_first_alias;  # saves the first one encountered.
14505         foreach my $alias ($table->aliases) {
14506
14507             # Skip if not to go in pod.
14508             next unless $alias->make_re_pod_entry;
14509
14510             # Start gathering all the components for the entry
14511             my $name = $alias->name;
14512
14513             # Skip if name is empty, as can't be accessed by regexes.
14514             next if $name eq "";
14515
14516             my $entry;      # Holds the left column, may include extras
14517             my $entry_ref;  # To refer to the left column's contents from
14518                             # another entry; has no extras
14519
14520             # First the left column of the pod entry.  Tables for the $perl
14521             # property always use the single form.
14522             if ($table_property == $perl) {
14523                 $entry = "\\p{$name}";
14524                 $entry_ref = "\\p{$name}";
14525             }
14526             else {    # Compound form.
14527
14528                 # Only generate one entry for all the aliases that mean true
14529                 # or false in binary properties.  Append a '*' to indicate
14530                 # some are missing.  (The heading comment notes this.)
14531                 my $rhs;
14532                 if ($type == $BINARY) {
14533                     next if $name ne 'N' && $name ne 'Y';
14534                     $rhs = "$name*";
14535                 }
14536                 elsif ($type != $FORCED_BINARY) {
14537                     $rhs = $name;
14538                 }
14539                 else {
14540
14541                     # Forced binary properties require special handling.  It
14542                     # has two sets of tables, one set is true/false; and the
14543                     # other set is everything else.  Entries are generated for
14544                     # each set.  Use the Bidi_Mirrored property (which appears
14545                     # in all Unicode versions) to get a list of the aliases
14546                     # for the true/false tables.  Of these, only output the N
14547                     # and Y ones, the same as, a regular binary property.  And
14548                     # output all the rest, same as a non-binary property.
14549                     my $bm = property_ref("Bidi_Mirrored");
14550                     if ($name eq 'N' || $name eq 'Y') {
14551                         $rhs = "$name*";
14552                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
14553                                                         $bm->table("N")->aliases)
14554                     {
14555                         next;
14556                     }
14557                     else {
14558                         $rhs = $name;
14559                     }
14560                 }
14561
14562                 # Colon-space is used to give a little more space to be easier
14563                 # to read;
14564                 $entry = "\\p{"
14565                         . $table_property_full_name
14566                         . ": $rhs}";
14567
14568                 # But for the reference to this entry, which will go in the
14569                 # right column, where space is at a premium, use equals
14570                 # without a space
14571                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
14572             }
14573
14574             # Then the right (info) column.  This is stored as components of
14575             # an array for the moment, then joined into a string later.  For
14576             # non-internal only properties, begin the info with the entry for
14577             # the first table we encountered (if any), as things are ordered
14578             # so that that one is the most descriptive.  This leads to the
14579             # info column of an entry being a more descriptive version of the
14580             # name column
14581             my @info;
14582             if ($name =~ /^_/) {
14583                 push @info,
14584                         '(For internal use by Perl, not necessarily stable)';
14585             }
14586             elsif ($entry_for_first_alias) {
14587                 push @info, $entry_for_first_alias;
14588             }
14589
14590             # If this entry is equivalent to another, add that to the info,
14591             # using the first such table we encountered
14592             if ($entry_for_first_table) {
14593                 if (@info) {
14594                     push @info, "(= $entry_for_first_table)";
14595                 }
14596                 else {
14597                     push @info, $entry_for_first_table;
14598                 }
14599             }
14600
14601             # If the name is a large integer, add an equivalent with an
14602             # exponent for better readability
14603             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
14604                 push @info, sprintf "(= %.1e)", $name
14605             }
14606
14607             my $parenthesized = "";
14608             if (! $entry_for_first_alias) {
14609
14610                 # This is the first alias for the current table.  The alias
14611                 # array is ordered so that this is the fullest, most
14612                 # descriptive alias, so it gets the fullest info.  The other
14613                 # aliases are mostly merely pointers to this one, using the
14614                 # information already added above.
14615
14616                 # Display any status message, but only on the parent table
14617                 if ($status && ! $entry_for_first_table) {
14618                     push @info, $status_info;
14619                 }
14620
14621                 # Put out any descriptive info
14622                 if (@table_description || @table_note) {
14623                     push @info, join "; ", @table_description, @table_note;
14624                 }
14625
14626                 # Look to see if there is a shorter name we can point people
14627                 # at
14628                 my $standard_name = standardize($name);
14629                 my $short_name;
14630                 my $proposed_short = $table->short_name;
14631                 if (defined $proposed_short) {
14632                     my $standard_short = standardize($proposed_short);
14633
14634                     # If the short name is shorter than the standard one, or
14635                     # even it it's not, but the combination of it and its
14636                     # short property name (as in \p{prop=short} ($perl doesn't
14637                     # have this form)) saves at least two characters, then,
14638                     # cause it to be listed as a shorter synonym.
14639                     if (length $standard_short < length $standard_name
14640                         || ($table_property != $perl
14641                             && (length($standard_short)
14642                                 - length($standard_name)
14643                                 + $table_property_short_delta)  # (<= 0)
14644                                 < -2))
14645                     {
14646                         $short_name = $proposed_short;
14647                         if ($table_property != $perl) {
14648                             $short_name = $table_property_short_name
14649                                           . "=$short_name";
14650                         }
14651                         $short_name = "\\p{$short_name}";
14652                     }
14653                 }
14654
14655                 # And if this is a compound form name, see if there is a
14656                 # single form equivalent
14657                 my $single_form;
14658                 if ($table_property != $perl) {
14659
14660                     # Special case the binary N tables, so that will print
14661                     # \P{single}, but use the Y table values to populate
14662                     # 'single', as we haven't likewise populated the N table.
14663                     # For forced binary tables, we can't just look at the N
14664                     # table, but must see if this table is equivalent to the N
14665                     # one, as there are two equivalent beasts in these
14666                     # properties.
14667                     my $test_table;
14668                     my $p;
14669                     if (   ($type == $BINARY
14670                             && $input_table == $property->table('No'))
14671                         || ($type == $FORCED_BINARY
14672                             && $property->table('No')->
14673                                         is_set_equivalent_to($input_table)))
14674                     {
14675                         $test_table = $property->table('Yes');
14676                         $p = 'P';
14677                     }
14678                     else {
14679                         $test_table = $input_table;
14680                         $p = 'p';
14681                     }
14682
14683                     # Look for a single form amongst all the children.
14684                     foreach my $table ($test_table->children) {
14685                         next if $table->property != $perl;
14686                         my $proposed_name = $table->short_name;
14687                         next if ! defined $proposed_name;
14688
14689                         # Don't mention internal-only properties as a possible
14690                         # single form synonym
14691                         next if substr($proposed_name, 0, 1) eq '_';
14692
14693                         $proposed_name = "\\$p\{$proposed_name}";
14694                         if (! defined $single_form
14695                             || length($proposed_name) < length $single_form)
14696                         {
14697                             $single_form = $proposed_name;
14698
14699                             # The goal here is to find a single form; not the
14700                             # shortest possible one.  We've already found a
14701                             # short name.  So, stop at the first single form
14702                             # found, which is likely to be closer to the
14703                             # original.
14704                             last;
14705                         }
14706                     }
14707                 }
14708
14709                 # Ouput both short and single in the same parenthesized
14710                 # expression, but with only one of 'Single', 'Short' if there
14711                 # are both items.
14712                 if ($short_name || $single_form || $table->conflicting) {
14713                     $parenthesized .= "Short: $short_name" if $short_name;
14714                     if ($short_name && $single_form) {
14715                         $parenthesized .= ', ';
14716                     }
14717                     elsif ($single_form) {
14718                         $parenthesized .= 'Single: ';
14719                     }
14720                     $parenthesized .= $single_form if $single_form;
14721                 }
14722             }
14723
14724             if ($caseless_equivalent != 0) {
14725                 $parenthesized .=  '; ' if $parenthesized ne "";
14726                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
14727             }
14728
14729
14730             # Warn if this property isn't the same as one that a
14731             # semi-casual user might expect.  The other components of this
14732             # parenthesized structure are calculated only for the first entry
14733             # for this table, but the conflicting is deemed important enough
14734             # to go on every entry.
14735             my $conflicting = join " NOR ", $table->conflicting;
14736             if ($conflicting) {
14737                 $parenthesized .=  '; ' if $parenthesized ne "";
14738                 $parenthesized .= "NOT $conflicting";
14739             }
14740
14741             push @info, "($parenthesized)" if $parenthesized;
14742
14743             if ($name =~ /_$/ && $alias->loose_match) {
14744                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
14745             }
14746
14747             if ($table_property != $perl && $table->perl_extension) {
14748                 push @info, '(Perl extension)';
14749             }
14750             push @info, "($string_count)";
14751
14752             # Now, we have both the entry and info so add them to the
14753             # list of all the properties.
14754             push @match_properties,
14755                 format_pod_line($indent_info_column,
14756                                 $entry,
14757                                 join( " ", @info),
14758                                 $alias->status,
14759                                 $alias->loose_match);
14760
14761             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
14762         } # End of looping through the aliases for this table.
14763
14764         if (! $entry_for_first_table) {
14765             $entry_for_first_table = $entry_for_first_alias;
14766         }
14767     } # End of looping through all the related tables
14768     return;
14769 }
14770
14771 sub make_ucd_table_pod_entries {
14772     my $table = shift;
14773
14774     # Generate the entries for the UCD section of the pod for $table.  This
14775     # also calculates if names are ambiguous, so has to be called even if the
14776     # pod is not being output
14777
14778     my $short_name = $table->name;
14779     my $standard_short_name = standardize($short_name);
14780     my $full_name = $table->full_name;
14781     my $standard_full_name = standardize($full_name);
14782
14783     my $full_info = "";     # Text of info column for full-name entries
14784     my $other_info = "";    # Text of info column for short-name entries
14785     my $short_info = "";    # Text of info column for other entries
14786     my $meaning = "";       # Synonym of this table
14787
14788     my $property = ($table->isa('Property'))
14789                    ? $table
14790                    : $table->parent->property;
14791
14792     my $perl_extension = $table->perl_extension;
14793
14794     # Get the more official name for for perl extensions that aren't
14795     # stand-alone properties
14796     if ($perl_extension && $property != $table) {
14797         if ($property == $perl ||$property->type == $BINARY) {
14798             $meaning = $table->complete_name;
14799         }
14800         else {
14801             $meaning = $property->full_name . "=$full_name";
14802         }
14803     }
14804
14805     # There are three types of info column.  One for the short name, one for
14806     # the full name, and one for everything else.  They mostly are the same,
14807     # so initialize in the same loop.
14808     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
14809         if ($perl_extension && $property != $table) {
14810
14811             # Add the synonymous name for the non-full name entries; and to
14812             # the full-name entry if it adds extra information
14813             if ($info_ref == \$other_info
14814                 || ($info_ref == \$short_info
14815                     && $standard_short_name ne $standard_full_name)
14816                 || standardize($meaning) ne $standard_full_name
14817             ) {
14818                 $$info_ref .= "$meaning.";
14819             }
14820         }
14821         elsif ($info_ref != \$full_info) {
14822
14823             # Otherwise, the non-full name columns include the full name
14824             $$info_ref .= $full_name;
14825         }
14826
14827         # And the full-name entry includes the short name, if different
14828         if ($info_ref == \$full_info
14829             && $standard_short_name ne $standard_full_name)
14830         {
14831             $full_info =~ s/\.\Z//;
14832             $full_info .= "  " if $full_info;
14833             $full_info .= "(Short: $short_name)";
14834         }
14835
14836         if ($table->perl_extension) {
14837             $$info_ref =~ s/\.\Z//;
14838             $$info_ref .= ".  " if $$info_ref;
14839             $$info_ref .= "(Perl extension)";
14840         }
14841     }
14842
14843     # Add any extra annotations to the full name entry
14844     foreach my $more_info ($table->description,
14845                             $table->note,
14846                             $table->status_info)
14847     {
14848         next unless $more_info;
14849         $full_info =~ s/\.\Z//;
14850         $full_info .= ".  " if $full_info;
14851         $full_info .= $more_info;
14852     }
14853
14854     # These keep track if have created full and short name pod entries for the
14855     # property
14856     my $done_full = 0;
14857     my $done_short = 0;
14858
14859     # Every possible name is kept track of, even those that aren't going to be
14860     # output.  This way we can be sure to find the ambiguities.
14861     foreach my $alias ($table->aliases) {
14862         my $name = $alias->name;
14863         my $standard = standardize($name);
14864         my $info;
14865         my $output_this = $alias->ucd;
14866
14867         # If the full and short names are the same, we want to output the full
14868         # one's entry, so it has priority.
14869         if ($standard eq $standard_full_name) {
14870             next if $done_full;
14871             $done_full = 1;
14872             $info = $full_info;
14873         }
14874         elsif ($standard eq $standard_short_name) {
14875             next if $done_short;
14876             $done_short = 1;
14877             next if $standard_short_name eq $standard_full_name;
14878             $info = $short_info;
14879         }
14880         else {
14881             $info = $other_info;
14882         }
14883
14884         # Here, we have set up the two columns for this entry.  But if an
14885         # entry already exists for this name, we have to decide which one
14886         # we're going to later output.
14887         if (exists $ucd_pod{$standard}) {
14888
14889             # If the two entries refer to the same property, it's not going to
14890             # be ambiguous.  (Likely it's because the names when standardized
14891             # are the same.)  But that means if they are different properties,
14892             # there is ambiguity.
14893             if ($ucd_pod{$standard}->{'property'} != $property) {
14894
14895                 # Here, we have an ambiguity.  This code assumes that one is
14896                 # scheduled to be output and one not and that one is a perl
14897                 # extension (which is not to be output) and the other isn't.
14898                 # If those assumptions are wrong, things have to be rethought.
14899                 if ($ucd_pod{$standard}{'output_this'} == $output_this
14900                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
14901                     || $output_this == $perl_extension)
14902                 {
14903                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
14904                 }
14905
14906                 # We modifiy the info column of the one being output to
14907                 # indicate the ambiguity.  Set $which to point to that one's
14908                 # info.
14909                 my $which;
14910                 if ($ucd_pod{$standard}{'output_this'}) {
14911                     $which = \$ucd_pod{$standard}->{'info'};
14912                 }
14913                 else {
14914                     $which = \$info;
14915                     $meaning = $ucd_pod{$standard}{'meaning'};
14916                 }
14917
14918                 chomp $$which;
14919                 $$which =~ s/\.\Z//;
14920                 $$which .= "; NOT '$standard' meaning '$meaning'";
14921
14922                 $ambiguous_names{$standard} = 1;
14923             }
14924
14925             # Use the non-perl-extension variant
14926             next unless $ucd_pod{$standard}{'perl_extension'};
14927         }
14928
14929         # Store enough information about this entry that we can later look for
14930         # ambiguities, and output it properly.
14931         $ucd_pod{$standard} = { 'name' => $name,
14932                                 'info' => $info,
14933                                 'meaning' => $meaning,
14934                                 'output_this' => $output_this,
14935                                 'perl_extension' => $perl_extension,
14936                                 'property' => $property,
14937                                 'status' => $alias->status,
14938         };
14939     } # End of looping through all this table's aliases
14940
14941     return;
14942 }
14943
14944 sub pod_alphanumeric_sort {
14945     # Sort pod entries alphanumerically.
14946
14947     # The first few character columns are filler, plus the '\p{'; and get rid
14948     # of all the trailing stuff, starting with the trailing '}', so as to sort
14949     # on just 'Name=Value'
14950     (my $a = lc $a) =~ s/^ .*? { //x;
14951     $a =~ s/}.*//;
14952     (my $b = lc $b) =~ s/^ .*? { //x;
14953     $b =~ s/}.*//;
14954
14955     # Determine if the two operands are both internal only or both not.
14956     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
14957     # should be the underscore that begins internal only
14958     my $a_is_internal = (substr($a, 0, 1) eq '_');
14959     my $b_is_internal = (substr($b, 0, 1) eq '_');
14960
14961     # Sort so the internals come last in the table instead of first (which the
14962     # leading underscore would otherwise indicate).
14963     if ($a_is_internal != $b_is_internal) {
14964         return 1 if $a_is_internal;
14965         return -1
14966     }
14967
14968     # Determine if the two operands are numeric property values or not.
14969     # A numeric property will look like xyz: 3.  But the number
14970     # can begin with an optional minus sign, and may have a
14971     # fraction or rational component, like xyz: 3/2.  If either
14972     # isn't numeric, use alphabetic sort.
14973     my ($a_initial, $a_number) =
14974         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
14975     return $a cmp $b unless defined $a_number;
14976     my ($b_initial, $b_number) =
14977         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
14978     return $a cmp $b unless defined $b_number;
14979
14980     # Here they are both numeric, but use alphabetic sort if the
14981     # initial parts don't match
14982     return $a cmp $b if $a_initial ne $b_initial;
14983
14984     # Convert rationals to floating for the comparison.
14985     $a_number = eval $a_number if $a_number =~ qr{/};
14986     $b_number = eval $b_number if $b_number =~ qr{/};
14987
14988     return $a_number <=> $b_number;
14989 }
14990
14991 sub make_pod () {
14992     # Create the .pod file.  This generates the various subsections and then
14993     # combines them in one big HERE document.
14994
14995     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
14996
14997     return unless defined $pod_directory;
14998     print "Making pod file\n" if $verbosity >= $PROGRESS;
14999
15000     my $exception_message =
15001     '(Any exceptions are individually noted beginning with the word NOT.)';
15002     my @block_warning;
15003     if (-e 'Blocks.txt') {
15004
15005         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
15006         # if the global $has_In_conflicts indicates we have them.
15007         push @match_properties, format_pod_line($indent_info_column,
15008                                                 '\p{In_*}',
15009                                                 '\p{Block: *}'
15010                                                     . (($has_In_conflicts)
15011                                                       ? " $exception_message"
15012                                                       : ""));
15013         @block_warning = << "END";
15014
15015 Matches in the Block property have shortcuts that begin with "In_".  For
15016 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
15017 backward compatibility, if there is no conflict with another shortcut, these
15018 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
15019 are numerous such conflicting shortcuts.  Use of these forms for Block is
15020 discouraged, and are flagged as such, not only because of the potential
15021 confusion as to what is meant, but also because a later release of Unicode may
15022 preempt the shortcut, and your program would no longer be correct.  Use the
15023 "In_" form instead to avoid this, or even more clearly, use the compound form,
15024 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
15025 about this.
15026 END
15027     }
15028     my $text = $Is_flags_text;
15029     $text = "$exception_message $text" if $has_Is_conflicts;
15030
15031     # And the 'Is_ line';
15032     push @match_properties, format_pod_line($indent_info_column,
15033                                             '\p{Is_*}',
15034                                             "\\p{*} $text");
15035
15036     # Sort the properties array for output.  It is sorted alphabetically
15037     # except numerically for numeric properties, and only output unique lines.
15038     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15039
15040     my $formatted_properties = simple_fold(\@match_properties,
15041                                         "",
15042                                         # indent succeeding lines by two extra
15043                                         # which looks better
15044                                         $indent_info_column + 2,
15045
15046                                         # shorten the line length by how much
15047                                         # the formatter indents, so the folded
15048                                         # line will fit in the space
15049                                         # presumably available
15050                                         $automatic_pod_indent);
15051     # Add column headings, indented to be a little more centered, but not
15052     # exactly
15053     $formatted_properties =  format_pod_line($indent_info_column,
15054                                                     '    NAME',
15055                                                     '           INFO')
15056                                     . "\n"
15057                                     . $formatted_properties;
15058
15059     # Generate pod documentation lines for the tables that match nothing
15060     my $zero_matches = "";
15061     if (@zero_match_tables) {
15062         @zero_match_tables = uniques(@zero_match_tables);
15063         $zero_matches = join "\n\n",
15064                         map { $_ = '=item \p{' . $_->complete_name . "}" }
15065                             sort { $a->complete_name cmp $b->complete_name }
15066                             @zero_match_tables;
15067
15068         $zero_matches = <<END;
15069
15070 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15071
15072 Unicode has some property-value pairs that currently don't match anything.
15073 This happens generally either because they are obsolete, or they exist for
15074 symmetry with other forms, but no language has yet been encoded that uses
15075 them.  In this version of Unicode, the following match zero code points:
15076
15077 =over 4
15078
15079 $zero_matches
15080
15081 =back
15082
15083 END
15084     }
15085
15086     # Generate list of properties that we don't accept, grouped by the reasons
15087     # why.  This is so only put out the 'why' once, and then list all the
15088     # properties that have that reason under it.
15089
15090     my %why_list;   # The keys are the reasons; the values are lists of
15091                     # properties that have the key as their reason
15092
15093     # For each property, add it to the list that are suppressed for its reason
15094     # The sort will cause the alphabetically first properties to be added to
15095     # each list first, so each list will be sorted.
15096     foreach my $property (sort keys %why_suppressed) {
15097         push @{$why_list{$why_suppressed{$property}}}, $property;
15098     }
15099
15100     # For each reason (sorted by the first property that has that reason)...
15101     my @bad_re_properties;
15102     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15103                      keys %why_list)
15104     {
15105         # Add to the output, all the properties that have that reason.
15106         my $has_item = 0;   # Flag if actually output anything.
15107         foreach my $name (@{$why_list{$why}}) {
15108
15109             # Split compound names into $property and $table components
15110             my $property = $name;
15111             my $table;
15112             if ($property =~ / (.*) = (.*) /x) {
15113                 $property = $1;
15114                 $table = $2;
15115             }
15116
15117             # This release of Unicode may not have a property that is
15118             # suppressed, so don't reference a non-existent one.
15119             $property = property_ref($property);
15120             next if ! defined $property;
15121
15122             # And since this list is only for match tables, don't list the
15123             # ones that don't have match tables.
15124             next if ! $property->to_create_match_tables;
15125
15126             # Find any abbreviation, and turn it into a compound name if this
15127             # is a property=value pair.
15128             my $short_name = $property->name;
15129             $short_name .= '=' . $property->table($table)->name if $table;
15130
15131             # Start with an empty line.
15132             push @bad_re_properties, "\n\n" unless $has_item;
15133
15134             # And add the property as an item for the reason.
15135             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15136             $has_item = 1;
15137         }
15138
15139         # And add the reason under the list of properties, if such a list
15140         # actually got generated.  Note that the header got added
15141         # unconditionally before.  But pod ignores extra blank lines, so no
15142         # harm.
15143         push @bad_re_properties, "\n$why\n" if $has_item;
15144
15145     } # End of looping through each reason.
15146
15147     if (! @bad_re_properties) {
15148         push @bad_re_properties,
15149                 "*** This installation accepts ALL non-Unihan properties ***";
15150     }
15151     else {
15152         # Add =over only if non-empty to avoid an empty =over/=back section,
15153         # which is considered bad form.
15154         unshift @bad_re_properties, "\n=over 4\n";
15155         push @bad_re_properties, "\n=back\n";
15156     }
15157
15158     # Similiarly, generate a list of files that we don't use, grouped by the
15159     # reasons why.  First, create a hash whose keys are the reasons, and whose
15160     # values are anonymous arrays of all the files that share that reason.
15161     my %grouped_by_reason;
15162     foreach my $file (keys %ignored_files) {
15163         push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15164     }
15165     foreach my $file (keys %skipped_files) {
15166         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15167     }
15168
15169     # Then, sort each group.
15170     foreach my $group (keys %grouped_by_reason) {
15171         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15172                                         @{$grouped_by_reason{$group}} ;
15173     }
15174
15175     # Finally, create the output text.  For each reason (sorted by the
15176     # alphabetically first file that has that reason)...
15177     my @unused_files;
15178     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15179                                cmp lc $grouped_by_reason{$b}->[0]
15180                               }
15181                          keys %grouped_by_reason)
15182     {
15183         # Add all the files that have that reason to the output.  Start
15184         # with an empty line.
15185         push @unused_files, "\n\n";
15186         push @unused_files, map { "\n=item F<$_> \n" }
15187                             @{$grouped_by_reason{$reason}};
15188         # And add the reason under the list of files
15189         push @unused_files, "\n$reason\n";
15190     }
15191
15192     # Similarly, create the output text for the UCD section of the pod
15193     my @ucd_pod;
15194     foreach my $key (keys %ucd_pod) {
15195         next unless $ucd_pod{$key}->{'output_this'};
15196         push @ucd_pod, format_pod_line($indent_info_column,
15197                                        $ucd_pod{$key}->{'name'},
15198                                        $ucd_pod{$key}->{'info'},
15199                                        $ucd_pod{$key}->{'status'},
15200                                       );
15201     }
15202
15203     # Sort alphabetically, and fold for output
15204     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15205     my $ucd_pod = simple_fold(\@ucd_pod,
15206                            ' ',
15207                            $indent_info_column,
15208                            $automatic_pod_indent);
15209     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
15210                 . "\n"
15211                 . $ucd_pod;
15212     local $" = "";
15213
15214     # Everything is ready to assemble.
15215     my @OUT = << "END";
15216 =begin comment
15217
15218 $HEADER
15219
15220 To change this file, edit $0 instead.
15221
15222 =end comment
15223
15224 =head1 NAME
15225
15226 $pod_file - Index of Unicode Version $string_version character properties in Perl
15227
15228 =head1 DESCRIPTION
15229
15230 This document provides information about the portion of the Unicode database
15231 that deals with character properties, that is the portion that is defined on
15232 single code points.  (L</Other information in the Unicode data base>
15233 below briefly mentions other data that Unicode provides.)
15234
15235 Perl can provide access to all non-provisional Unicode character properties,
15236 though not all are enabled by default.  The omitted ones are the Unihan
15237 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15238 deprecated or Unicode-internal properties.  (An installation may choose to
15239 recompile Perl's tables to change this.  See L<Unicode character
15240 properties that are NOT accepted by Perl>.)
15241
15242 For most purposes, access to Unicode properties from the Perl core is through
15243 regular expression matches, as described in the next section.
15244 For some special purposes, and to access the properties that are not suitable
15245 for regular expression matching, all the Unicode character properties that
15246 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15247 described in the section L</Properties accessible through Unicode::UCD>.
15248
15249 Perl also provides some additional extensions and short-cut synonyms
15250 for Unicode properties.
15251
15252 This document merely lists all available properties and does not attempt to
15253 explain what each property really means.  There is a brief description of each
15254 Perl extension; see L<perlunicode/Other Properties> for more information on
15255 these.  There is some detail about Blocks, Scripts, General_Category,
15256 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15257 official Unicode properties, refer to the Unicode standard.  A good starting
15258 place is L<$unicode_reference_url>.
15259
15260 Note that you can define your own properties; see
15261 L<perlunicode/"User-Defined Character Properties">.
15262
15263 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15264
15265 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15266 most of the Unicode character properties.  The table below shows all these
15267 constructs, both single and compound forms.
15268
15269 B<Compound forms> consist of two components, separated by an equals sign or a
15270 colon.  The first component is the property name, and the second component is
15271 the particular value of the property to match against, for example,
15272 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15273 whose Script property is Greek.
15274
15275 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15276 their equivalent compound forms.  The table shows these equivalences.  (In our
15277 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
15278 There are also a few Perl-defined single forms that are not shortcuts for a
15279 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
15280
15281 In parsing these constructs, Perl always ignores Upper/lower case differences
15282 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
15283 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
15284 the left brace completely changes the meaning of the construct, from "match"
15285 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
15286 for improved legibility.
15287
15288 Also, white space, hyphens, and underscores are also normally ignored
15289 everywhere between the {braces}, and hence can be freely added or removed
15290 even if the C</x> modifier hasn't been specified on the regular expression.
15291 But $a_bold_stricter at the beginning of an entry in the table below
15292 means that tighter (stricter) rules are used for that entry:
15293
15294 =over 4
15295
15296 =item Single form (C<\\p{name}>) tighter rules:
15297
15298 White space, hyphens, and underscores ARE significant
15299 except for:
15300
15301 =over 4
15302
15303 =item * white space adjacent to a non-word character
15304
15305 =item * underscores separating digits in numbers
15306
15307 =back
15308
15309 That means, for example, that you can freely add or remove white space
15310 adjacent to (but within) the braces without affecting the meaning.
15311
15312 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
15313
15314 The tighter rules given above for the single form apply to everything to the
15315 right of the colon or equals; the looser rules still apply to everything to
15316 the left.
15317
15318 That means, for example, that you can freely add or remove white space
15319 adjacent to (but within) the braces and the colon or equal sign.
15320
15321 =back
15322
15323 Some properties are considered obsolete by Unicode, but still available.
15324 There are several varieties of obsolescence:
15325
15326 =over 4
15327
15328 =item Stabilized
15329
15330 A property may be stabilized.  Such a determination does not indicate
15331 that the property should or should not be used; instead it is a declaration
15332 that the property will not be maintained nor extended for newly encoded
15333 characters.  Such properties are marked with $a_bold_stabilized in the
15334 table.
15335
15336 =item Deprecated
15337
15338 A property may be deprecated, perhaps because its original intent
15339 has been replaced by another property, or because its specification was
15340 somehow defective.  This means that its use is strongly
15341 discouraged, so much so that a warning will be issued if used, unless the
15342 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
15343 statement.  $A_bold_deprecated flags each such entry in the table, and
15344 the entry there for the longest, most descriptive version of the property will
15345 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
15346 warning, even for properties that aren't officially deprecated by Unicode,
15347 when there used to be characters or code points that were matched by them, but
15348 no longer.  This is to warn you that your program may not work like it did on
15349 earlier Unicode releases.
15350
15351 A deprecated property may be made unavailable in a future Perl version, so it
15352 is best to move away from them.
15353
15354 A deprecated property may also be stabilized, but this fact is not shown.
15355
15356 =item Obsolete
15357
15358 Properties marked with $a_bold_obsolete in the table are considered (plain)
15359 obsolete.  Generally this designation is given to properties that Unicode once
15360 used for internal purposes (but not any longer).
15361
15362 =back
15363
15364 Some Perl extensions are present for backwards compatibility and are
15365 discouraged from being used, but are not obsolete.  $A_bold_discouraged
15366 flags each such entry in the table.  Future Unicode versions may force
15367 some of these extensions to be removed without warning, replaced by another
15368 property with the same name that means something different.  Use the
15369 equivalent shown instead.
15370
15371 @block_warning
15372
15373 The table below has two columns.  The left column contains the C<\\p{}>
15374 constructs to look up, possibly preceded by the flags mentioned above; and
15375 the right column contains information about them, like a description, or
15376 synonyms.  It shows both the single and compound forms for each property that
15377 has them.  If the left column is a short name for a property, the right column
15378 will give its longer, more descriptive name; and if the left column is the
15379 longest name, the right column will show any equivalent shortest name, in both
15380 single and compound forms if applicable.
15381
15382 The right column will also caution you if a property means something different
15383 than what might normally be expected.
15384
15385 All single forms are Perl extensions; a few compound forms are as well, and
15386 are noted as such.
15387
15388 Numbers in (parentheses) indicate the total number of code points matched by
15389 the property.  For emphasis, those properties that match no code points at all
15390 are listed as well in a separate section following the table.
15391
15392 Most properties match the same code points regardless of whether C<"/i">
15393 case-insensitive matching is specified or not.  But a few properties are
15394 affected.  These are shown with the notation
15395
15396  (/i= other_property)
15397
15398 in the second column.  Under case-insensitive matching they match the
15399 same code pode points as the property "other_property".
15400
15401 There is no description given for most non-Perl defined properties (See
15402 L<$unicode_reference_url> for that).
15403
15404 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
15405 combinations.  For example, entries like:
15406
15407  \\p{Gc: *}                                  \\p{General_Category: *}
15408
15409 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
15410 for the latter is also valid for the former.  Similarly,
15411
15412  \\p{Is_*}                                   \\p{*}
15413
15414 means that if and only if, for example, C<\\p{Foo}> exists, then
15415 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
15416 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
15417 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
15418 underscore.
15419
15420 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
15421 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
15422 'N*' to indicate this, and doesn't have separate entries for the other
15423 possibilities.  Note that not all properties which have values 'Yes' and 'No'
15424 are binary, and they have all their values spelled out without using this wild
15425 card, and a C<NOT> clause in their description that highlights their not being
15426 binary.  These also require the compound form to match them, whereas true
15427 binary properties have both single and compound forms available.
15428
15429 Note that all non-essential underscores are removed in the display of the
15430 short names below.
15431
15432 B<Legend summary:>
15433
15434 =over 4
15435
15436 =item Z<>B<*> is a wild-card
15437
15438 =item B<(\\d+)> in the info column gives the number of code points matched by
15439 this property.
15440
15441 =item B<$DEPRECATED> means this is deprecated.
15442
15443 =item B<$OBSOLETE> means this is obsolete.
15444
15445 =item B<$STABILIZED> means this is stabilized.
15446
15447 =item B<$STRICTER> means tighter (stricter) name matching applies.
15448
15449 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
15450 stable.
15451
15452 =back
15453
15454 $formatted_properties
15455
15456 $zero_matches
15457
15458 =head1 Properties accessible through Unicode::UCD
15459
15460 All the Unicode character properties mentioned above (except for those marked
15461 as for internal use by Perl) are also accessible by
15462 L<Unicode::UCD/prop_invlist()>.
15463
15464 Due to their nature, not all Unicode character properties are suitable for
15465 regular expression matches, nor C<prop_invlist()>.  The remaining
15466 non-provisional, non-internal ones are accessible via
15467 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
15468 hasn't included; see L<below for which those are|/Unicode character properties
15469 that are NOT accepted by Perl>).
15470
15471 For compatibility with other parts of Perl, all the single forms given in the
15472 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
15473 are recognized.  BUT, there are some ambiguities between some Perl extensions
15474 and the Unicode properties, all of which are silently resolved in favor of the
15475 official Unicode property.  To avoid surprises, you should only use
15476 C<prop_invmap()> for forms listed in the table below, which omits the
15477 non-recommended ones.  The affected forms are the Perl single form equivalents
15478 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
15479 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
15480 whose short name is C<sc>.  The table indicates the current ambiguities in the
15481 INFO column, beginning with the word C<"NOT">.
15482
15483 The standard Unicode properties listed below are documented in
15484 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
15485 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
15486 L<perlunicode/Other Properties>;
15487
15488 The first column in the table is a name for the property; the second column is
15489 an alternative name, if any, plus possibly some annotations.  The alternative
15490 name is the property's full name, unless that would simply repeat the first
15491 column, in which case the second column indicates the property's short name
15492 (if different).  The annotations are given only in the entry for the full
15493 name.  If a property is obsolete, etc, the entry will be flagged with the same
15494 characters used in the table in the L<section above|/Properties accessible
15495 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
15496
15497 $ucd_pod
15498
15499 =head1 Properties accessible through other means
15500
15501 Certain properties are accessible also via core function calls.  These are:
15502
15503  Lowercase_Mapping          lc() and lcfirst()
15504  Titlecase_Mapping          ucfirst()
15505  Uppercase_Mapping          uc()
15506
15507 Also, Case_Folding is accessible through the C</i> modifier in regular
15508 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
15509 operator.
15510
15511 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
15512 interpolation in double-quoted strings and regular expressions; and functions
15513 C<charnames::viacode()>, C<charnames::vianame()>, and
15514 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
15515 specified.
15516
15517 Finally, most properties related to decomposition are accessible via
15518 L<Unicode::Normalize>.
15519
15520 =head1 Unicode character properties that are NOT accepted by Perl
15521
15522 Perl will generate an error for a few character properties in Unicode when
15523 used in a regular expression.  The non-Unihan ones are listed below, with the
15524 reasons they are not accepted, perhaps with work-arounds.  The short names for
15525 the properties are listed enclosed in (parentheses).
15526 As described after the list, an installation can change the defaults and choose
15527 to accept any of these.  The list is machine generated based on the
15528 choices made for the installation that generated this document.
15529
15530 @bad_re_properties
15531
15532 An installation can choose to allow any of these to be matched by downloading
15533 the Unicode database from L<http://www.unicode.org/Public/> to
15534 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
15535 controlling lists contained in the program
15536 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
15537 (C<\%Config> is available from the Config module).
15538
15539 =head1 Other information in the Unicode data base
15540
15541 The Unicode data base is delivered in two different formats.  The XML version
15542 is valid for more modern Unicode releases.  The other version is a collection
15543 of files.  The two are intended to give equivalent information.  Perl uses the
15544 older form; this allows you to recompile Perl to use early Unicode releases.
15545
15546 The only non-character property that Perl currently supports is Named
15547 Sequences, in which a sequence of code points
15548 is given a name and generally treated as a single entity.  (Perl supports
15549 these via the C<\\N{...}> double-quotish construct,
15550 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
15551
15552 Below is a list of the files in the Unicode data base that Perl doesn't
15553 currently use, along with very brief descriptions of their purposes.
15554 Some of the names of the files have been shortened from those that Unicode
15555 uses, in order to allow them to be distinguishable from similarly named files
15556 on file systems for which only the first 8 characters of a name are
15557 significant.
15558
15559 =over 4
15560
15561 @unused_files
15562
15563 =back
15564
15565 =head1 SEE ALSO
15566
15567 L<$unicode_reference_url>
15568
15569 L<perlrecharclass>
15570
15571 L<perlunicode>
15572
15573 END
15574
15575     # And write it.  The 0 means no utf8.
15576     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
15577     return;
15578 }
15579
15580 sub make_Heavy () {
15581     # Create and write Heavy.pl, which passes info about the tables to
15582     # utf8_heavy.pl
15583
15584     # Stringify structures for output
15585     my $loose_property_name_of
15586                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
15587     chomp $loose_property_name_of;
15588
15589     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
15590     chomp $stricter_to_file_of;
15591
15592     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
15593     chomp $loose_to_file_of;
15594
15595     my $nv_floating_to_rational
15596                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
15597     chomp $nv_floating_to_rational;
15598
15599     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
15600     chomp $why_deprecated;
15601
15602     # We set the key to the file when we associated files with tables, but we
15603     # couldn't do the same for the value then, as we might not have the file
15604     # for the alternate table figured out at that time.
15605     foreach my $cased (keys %caseless_equivalent_to) {
15606         my @path = $caseless_equivalent_to{$cased}->file_path;
15607         my $path = join '/', @path[1, -1];
15608         $caseless_equivalent_to{$cased} = $path;
15609     }
15610     my $caseless_equivalent_to
15611                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
15612     chomp $caseless_equivalent_to;
15613
15614     my $loose_property_to_file_of
15615                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
15616     chomp $loose_property_to_file_of;
15617
15618     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
15619     chomp $file_to_swash_name;
15620
15621     my @heavy = <<END;
15622 $HEADER
15623 $INTERNAL_ONLY_HEADER
15624
15625 # This file is for the use of utf8_heavy.pl and Unicode::UCD
15626
15627 # Maps Unicode (not Perl single-form extensions) property names in loose
15628 # standard form to their corresponding standard names
15629 \%utf8::loose_property_name_of = (
15630 $loose_property_name_of
15631 );
15632
15633 # Maps property, table to file for those using stricter matching
15634 \%utf8::stricter_to_file_of = (
15635 $stricter_to_file_of
15636 );
15637
15638 # Maps property, table to file for those using loose matching
15639 \%utf8::loose_to_file_of = (
15640 $loose_to_file_of
15641 );
15642
15643 # Maps floating point to fractional form
15644 \%utf8::nv_floating_to_rational = (
15645 $nv_floating_to_rational
15646 );
15647
15648 # If a floating point number doesn't have enough digits in it to get this
15649 # close to a fraction, it isn't considered to be that fraction even if all the
15650 # digits it does have match.
15651 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
15652
15653 # Deprecated tables to generate a warning for.  The key is the file containing
15654 # the table, so as to avoid duplication, as many property names can map to the
15655 # file, but we only need one entry for all of them.
15656 \%utf8::why_deprecated = (
15657 $why_deprecated
15658 );
15659
15660 # A few properties have different behavior under /i matching.  This maps
15661 # those to substitute files to use under /i.
15662 \%utf8::caseless_equivalent = (
15663 $caseless_equivalent_to
15664 );
15665
15666 # Property names to mapping files
15667 \%utf8::loose_property_to_file_of = (
15668 $loose_property_to_file_of
15669 );
15670
15671 # Files to the swash names within them.
15672 \%utf8::file_to_swash_name = (
15673 $file_to_swash_name
15674 );
15675
15676 1;
15677 END
15678
15679     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
15680     return;
15681 }
15682
15683 sub make_Name_pm () {
15684     # Create and write Name.pm, which contains subroutines and data to use in
15685     # conjunction with Name.pl
15686
15687     # Maybe there's nothing to do.
15688     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
15689
15690     my @name = <<END;
15691 $HEADER
15692 $INTERNAL_ONLY_HEADER
15693 END
15694
15695     # Convert these structures to output format.
15696     my $code_points_ending_in_code_point =
15697         main::simple_dumper(\@code_points_ending_in_code_point,
15698                             ' ' x 8);
15699     my $names = main::simple_dumper(\%names_ending_in_code_point,
15700                                     ' ' x 8);
15701     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
15702                                     ' ' x 8);
15703
15704     # Do the same with the Hangul names,
15705     my $jamo;
15706     my $jamo_l;
15707     my $jamo_v;
15708     my $jamo_t;
15709     my $jamo_re;
15710     if ($has_hangul_syllables) {
15711
15712         # Construct a regular expression of all the possible
15713         # combinations of the Hangul syllables.
15714         my @L_re;   # Leading consonants
15715         for my $i ($LBase .. $LBase + $LCount - 1) {
15716             push @L_re, $Jamo{$i}
15717         }
15718         my @V_re;   # Middle vowels
15719         for my $i ($VBase .. $VBase + $VCount - 1) {
15720             push @V_re, $Jamo{$i}
15721         }
15722         my @T_re;   # Trailing consonants
15723         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
15724             push @T_re, $Jamo{$i}
15725         }
15726
15727         # The whole re is made up of the L V T combination.
15728         $jamo_re = '('
15729                     . join ('|', sort @L_re)
15730                     . ')('
15731                     . join ('|', sort @V_re)
15732                     . ')('
15733                     . join ('|', sort @T_re)
15734                     . ')?';
15735
15736         # These hashes needed by the algorithm were generated
15737         # during reading of the Jamo.txt file
15738         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
15739         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
15740         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
15741         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
15742     }
15743
15744     push @name, <<END;
15745
15746 package charnames;
15747
15748 # This module contains machine-generated tables and code for the
15749 # algorithmically-determinable Unicode character names.  The following
15750 # routines can be used to translate between name and code point and vice versa
15751
15752 { # Closure
15753
15754     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
15755     # two must be 10; if there are 5, the first must not be a 0.  Written this
15756     # way to decrease backtracking.  The first regex allows the code point to
15757     # be at the end of a word, but to work properly, the word shouldn't end
15758     # with a valid hex character.  The second one won't match a code point at
15759     # the end of a word, and doesn't have the run-on issue
15760     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
15761     my \$code_point_re = qr/$code_point_re/;
15762
15763     # In the following hash, the keys are the bases of names which includes
15764     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
15765     # of each key is another hash which is used to get the low and high ends
15766     # for each range of code points that apply to the name.
15767     my %names_ending_in_code_point = (
15768 $names
15769     );
15770
15771     # The following hash is a copy of the previous one, except is for loose
15772     # matching, so each name has blanks and dashes squeezed out
15773     my %loose_names_ending_in_code_point = (
15774 $loose_names
15775     );
15776
15777     # And the following array gives the inverse mapping from code points to
15778     # names.  Lowest code points are first
15779     my \@code_points_ending_in_code_point = (
15780 $code_points_ending_in_code_point
15781     );
15782 END
15783     # Earlier releases didn't have Jamos.  No sense outputting
15784     # them unless will be used.
15785     if ($has_hangul_syllables) {
15786         push @name, <<END;
15787
15788     # Convert from code point to Jamo short name for use in composing Hangul
15789     # syllable names
15790     my %Jamo = (
15791 $jamo
15792     );
15793
15794     # Leading consonant (can be null)
15795     my %Jamo_L = (
15796 $jamo_l
15797     );
15798
15799     # Vowel
15800     my %Jamo_V = (
15801 $jamo_v
15802     );
15803
15804     # Optional trailing consonant
15805     my %Jamo_T = (
15806 $jamo_t
15807     );
15808
15809     # Computed re that splits up a Hangul name into LVT or LV syllables
15810     my \$syllable_re = qr/$jamo_re/;
15811
15812     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
15813     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
15814
15815     # These constants names and values were taken from the Unicode standard,
15816     # version 5.1, section 3.12.  They are used in conjunction with Hangul
15817     # syllables
15818     my \$SBase = $SBase_string;
15819     my \$LBase = $LBase_string;
15820     my \$VBase = $VBase_string;
15821     my \$TBase = $TBase_string;
15822     my \$SCount = $SCount;
15823     my \$LCount = $LCount;
15824     my \$VCount = $VCount;
15825     my \$TCount = $TCount;
15826     my \$NCount = \$VCount * \$TCount;
15827 END
15828     } # End of has Jamos
15829
15830     push @name, << 'END';
15831
15832     sub name_to_code_point_special {
15833         my ($name, $loose) = @_;
15834
15835         # Returns undef if not one of the specially handled names; otherwise
15836         # returns the code point equivalent to the input name
15837         # $loose is non-zero if to use loose matching, 'name' in that case
15838         # must be input as upper case with all blanks and dashes squeezed out.
15839 END
15840     if ($has_hangul_syllables) {
15841         push @name, << 'END';
15842
15843         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
15844             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
15845         {
15846             return if $name !~ qr/^$syllable_re$/;
15847             my $L = $Jamo_L{$1};
15848             my $V = $Jamo_V{$2};
15849             my $T = (defined $3) ? $Jamo_T{$3} : 0;
15850             return ($L * $VCount + $V) * $TCount + $T + $SBase;
15851         }
15852 END
15853     }
15854     push @name, << 'END';
15855
15856         # Name must end in 'code_point' for this to handle.
15857         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
15858                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
15859
15860         my $base = $1;
15861         my $code_point = CORE::hex $2;
15862         my $names_ref;
15863
15864         if ($loose) {
15865             $names_ref = \%loose_names_ending_in_code_point;
15866         }
15867         else {
15868             return if $base !~ s/-$//;
15869             $names_ref = \%names_ending_in_code_point;
15870         }
15871
15872         # Name must be one of the ones which has the code point in it.
15873         return if ! $names_ref->{$base};
15874
15875         # Look through the list of ranges that apply to this name to see if
15876         # the code point is in one of them.
15877         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
15878             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
15879             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
15880
15881             # Here, the code point is in the range.
15882             return $code_point;
15883         }
15884
15885         # Here, looked like the name had a code point number in it, but
15886         # did not match one of the valid ones.
15887         return;
15888     }
15889
15890     sub code_point_to_name_special {
15891         my $code_point = shift;
15892
15893         # Returns the name of a code point if algorithmically determinable;
15894         # undef if not
15895 END
15896     if ($has_hangul_syllables) {
15897         push @name, << 'END';
15898
15899         # If in the Hangul range, calculate the name based on Unicode's
15900         # algorithm
15901         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
15902             use integer;
15903             my $SIndex = $code_point - $SBase;
15904             my $L = $LBase + $SIndex / $NCount;
15905             my $V = $VBase + ($SIndex % $NCount) / $TCount;
15906             my $T = $TBase + $SIndex % $TCount;
15907             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
15908             $name .= $Jamo{$T} if $T != $TBase;
15909             return $name;
15910         }
15911 END
15912     }
15913     push @name, << 'END';
15914
15915         # Look through list of these code points for one in range.
15916         foreach my $hash (@code_points_ending_in_code_point) {
15917             return if $code_point < $hash->{'low'};
15918             if ($code_point <= $hash->{'high'}) {
15919                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
15920             }
15921         }
15922         return;            # None found
15923     }
15924 } # End closure
15925
15926 1;
15927 END
15928
15929     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
15930     return;
15931 }
15932
15933 sub make_UCD () {
15934     # Create and write UCD.pl, which passes info about the tables to
15935     # Unicode::UCD
15936
15937     # Create a mapping from each alias of Perl single-form extensions to all
15938     # its equivalent aliases, for quick look-up.
15939     my %perlprop_to_aliases;
15940     foreach my $table ($perl->tables) {
15941
15942         # First create the list of the aliases of each extension
15943         my @aliases_list;    # List of legal aliases for this extension
15944
15945         my $table_name = $table->name;
15946         my $standard_table_name = standardize($table_name);
15947         my $table_full_name = $table->full_name;
15948         my $standard_table_full_name = standardize($table_full_name);
15949
15950         # Make sure that the list has both the short and full names
15951         push @aliases_list, $table_name, $table_full_name;
15952
15953         my $found_ucd = 0;  # ? Did we actually get an alias that should be
15954                             # output for this table
15955
15956         # Go through all the aliases (including the two just added), and add
15957         # any new unique ones to the list
15958         foreach my $alias ($table->aliases) {
15959
15960             # Skip non-legal names
15961             next unless $alias->ok_as_filename;
15962             next unless $alias->ucd;
15963
15964             $found_ucd = 1;     # have at least one legal name
15965
15966             my $name = $alias->name;
15967             my $standard = standardize($name);
15968
15969             # Don't repeat a name that is equivalent to one already on the
15970             # list
15971             next if $standard eq $standard_table_name;
15972             next if $standard eq $standard_table_full_name;
15973
15974             push @aliases_list, $name;
15975         }
15976
15977         # If there were no legal names, don't output anything.
15978         next unless $found_ucd;
15979
15980         # To conserve memory in the program reading these in, omit full names
15981         # that are identical to the short name, when those are the only two
15982         # aliases for the property.
15983         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
15984             pop @aliases_list;
15985         }
15986
15987         # Here, @aliases_list is the list of all the aliases that this
15988         # extension legally has.  Now can create a map to it from each legal
15989         # standardized alias
15990         foreach my $alias ($table->aliases) {
15991             next unless $alias->ucd;
15992             next unless $alias->ok_as_filename;
15993             push @{$perlprop_to_aliases{standardize($alias->name)}},
15994                  @aliases_list;
15995         }
15996     }
15997
15998     # Make a list of all combinations of properties/values that are suppressed.
15999     my @suppressed;
16000     if (! $debug_skip) {    # This tends to fail in this debug mode
16001         foreach my $property_name (keys %why_suppressed) {
16002
16003             # Just the value
16004             my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16005
16006             # The hash may contain properties not in this release of Unicode
16007             next unless defined (my $property = property_ref($property_name));
16008
16009             # Find all combinations
16010             foreach my $prop_alias ($property->aliases) {
16011                 my $prop_alias_name = standardize($prop_alias->name);
16012
16013                 # If no =value, there's just one combination possibe for this
16014                 if (! $value_name) {
16015
16016                     # The property may be suppressed, but there may be a proxy
16017                     # for it, so it shouldn't be listed as suppressed
16018                     next if $prop_alias->ucd;
16019                     push @suppressed, $prop_alias_name;
16020                 }
16021                 else {  # Otherwise
16022                     foreach my $value_alias
16023                                     ($property->table($value_name)->aliases)
16024                     {
16025                         next if $value_alias->ucd;
16026
16027                         push @suppressed, "$prop_alias_name="
16028                                         .  standardize($value_alias->name);
16029                     }
16030                 }
16031             }
16032         }
16033     }
16034
16035     # Convert the structure below (designed for Name.pm) to a form that UCD
16036     # wants, so it doesn't have to modify it at all; i.e. so that it includes
16037     # an element for the Hangul syllables in the appropriate place, and
16038     # otherwise changes the name to include the "-<code point>" suffix.
16039     my @algorithm_names;
16040     my $done_hangul = 0;
16041
16042     # Copy it linearly.
16043     for my $i (0 .. @code_points_ending_in_code_point - 1) {
16044
16045         # Insert the hanguls in the correct place.
16046         if (! $done_hangul
16047             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16048         {
16049             $done_hangul = 1;
16050             push @algorithm_names, { low => $SBase,
16051                                      high => $SBase + $SCount - 1,
16052                                      name => '<hangul syllable>',
16053                                     };
16054         }
16055
16056         # Copy the current entry, modified.
16057         push @algorithm_names, {
16058             low => $code_points_ending_in_code_point[$i]->{'low'},
16059             high => $code_points_ending_in_code_point[$i]->{'high'},
16060             name =>
16061                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16062         };
16063     }
16064
16065     # Serialize these structures for output.
16066     my $loose_to_standard_value
16067                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16068     chomp $loose_to_standard_value;
16069
16070     my $string_property_loose_to_name
16071                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16072     chomp $string_property_loose_to_name;
16073
16074     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16075     chomp $perlprop_to_aliases;
16076
16077     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16078     chomp $prop_aliases;
16079
16080     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16081     chomp $prop_value_aliases;
16082
16083     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16084     chomp $suppressed;
16085
16086     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16087     chomp $algorithm_names;
16088
16089     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16090     chomp $ambiguous_names;
16091
16092     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16093     chomp $loose_defaults;
16094
16095     my @ucd = <<END;
16096 $HEADER
16097 $INTERNAL_ONLY_HEADER
16098
16099 # This file is for the use of Unicode::UCD
16100
16101 # Highest legal Unicode code point
16102 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16103
16104 # Hangul syllables
16105 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16106 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16107
16108 # Keys are all the possible "prop=value" combinations, in loose form; values
16109 # are the standard loose name for the 'value' part of the key
16110 \%Unicode::UCD::loose_to_standard_value = (
16111 $loose_to_standard_value
16112 );
16113
16114 # String property loose names to standard loose name
16115 \%Unicode::UCD::string_property_loose_to_name = (
16116 $string_property_loose_to_name
16117 );
16118
16119 # Keys are Perl extensions in loose form; values are each one's list of
16120 # aliases
16121 \%Unicode::UCD::loose_perlprop_to_name = (
16122 $perlprop_to_aliases
16123 );
16124
16125 # Keys are standard property name; values are each one's aliases
16126 \%Unicode::UCD::prop_aliases = (
16127 $prop_aliases
16128 );
16129
16130 # Keys of top level are standard property name; values are keys to another
16131 # hash,  Each one is one of the property's values, in standard form.  The
16132 # values are that prop-val's aliases.  If only one specified, the short and
16133 # long alias are identical.
16134 \%Unicode::UCD::prop_value_aliases = (
16135 $prop_value_aliases
16136 );
16137
16138 # Ordered (by code point ordinal) list of the ranges of code points whose
16139 # names are algorithmically determined.  Each range entry is an anonymous hash
16140 # of the start and end points and a template for the names within it.
16141 \@Unicode::UCD::algorithmic_named_code_points = (
16142 $algorithm_names
16143 );
16144
16145 # The properties that as-is have two meanings, and which must be disambiguated
16146 \%Unicode::UCD::ambiguous_names = (
16147 $ambiguous_names
16148 );
16149
16150 # Keys are the prop-val combinations which are the default values for the
16151 # given property, expressed in standard loose form
16152 \%Unicode::UCD::loose_defaults = (
16153 $loose_defaults
16154 );
16155
16156 # All combinations of names that are suppressed.
16157 # This is actually for UCD.t, so it knows which properties shouldn't have
16158 # entries.  If it got any bigger, would probably want to put it in its own
16159 # file to use memory only when it was needed, in testing.
16160 \@Unicode::UCD::suppressed_properties = (
16161 $suppressed
16162 );
16163
16164 1;
16165 END
16166
16167     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
16168     return;
16169 }
16170
16171 sub write_all_tables() {
16172     # Write out all the tables generated by this program to files, as well as
16173     # the supporting data structures, pod file, and .t file.
16174
16175     my @writables;              # List of tables that actually get written
16176     my %match_tables_to_write;  # Used to collapse identical match tables
16177                                 # into one file.  Each key is a hash function
16178                                 # result to partition tables into buckets.
16179                                 # Each value is an array of the tables that
16180                                 # fit in the bucket.
16181
16182     # For each property ...
16183     # (sort so that if there is an immutable file name, it has precedence, so
16184     # some other property can't come in and take over its file name.  If b's
16185     # file name is defined, will return 1, meaning to take it first; don't
16186     # care if both defined, as they had better be different anyway.  And the
16187     # property named 'Perl' needs to be first (it doesn't have any immutable
16188     # file name) because empty properties are defined in terms of it's table
16189     # named 'Any'.)
16190     PROPERTY:
16191     foreach my $property (sort { return -1 if $a == $perl;
16192                                  return 1 if $b == $perl;
16193                                  return defined $b->file
16194                                 } property_ref('*'))
16195     {
16196         my $type = $property->type;
16197
16198         # And for each table for that property, starting with the mapping
16199         # table for it ...
16200         TABLE:
16201         foreach my $table($property,
16202
16203                         # and all the match tables for it (if any), sorted so
16204                         # the ones with the shortest associated file name come
16205                         # first.  The length sorting prevents problems of a
16206                         # longer file taking a name that might have to be used
16207                         # by a shorter one.  The alphabetic sorting prevents
16208                         # differences between releases
16209                         sort {  my $ext_a = $a->external_name;
16210                                 return 1 if ! defined $ext_a;
16211                                 my $ext_b = $b->external_name;
16212                                 return -1 if ! defined $ext_b;
16213
16214                                 # But return the non-complement table before
16215                                 # the complement one, as the latter is defined
16216                                 # in terms of the former, and needs to have
16217                                 # the information for the former available.
16218                                 return 1 if $a->complement != 0;
16219                                 return -1 if $b->complement != 0;
16220
16221                                 # Similarly, return a subservient table after
16222                                 # a leader
16223                                 return 1 if $a->leader != $a;
16224                                 return -1 if $b->leader != $b;
16225
16226                                 my $cmp = length $ext_a <=> length $ext_b;
16227
16228                                 # Return result if lengths not equal
16229                                 return $cmp if $cmp;
16230
16231                                 # Alphabetic if lengths equal
16232                                 return $ext_a cmp $ext_b
16233                         } $property->tables
16234                     )
16235         {
16236
16237             # Here we have a table associated with a property.  It could be
16238             # the map table (done first for each property), or one of the
16239             # other tables.  Determine which type.
16240             my $is_property = $table->isa('Property');
16241
16242             my $name = $table->name;
16243             my $complete_name = $table->complete_name;
16244
16245             # See if should suppress the table if is empty, but warn if it
16246             # contains something.
16247             my $suppress_if_empty_warn_if_not
16248                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
16249
16250             # Calculate if this table should have any code points associated
16251             # with it or not.
16252             my $expected_empty =
16253
16254                 # $perl should be empty, as well as properties that we just
16255                 # don't do anything with
16256                 ($is_property
16257                     && ($table == $perl
16258                         || grep { $complete_name eq $_ }
16259                                                     @unimplemented_properties
16260                     )
16261                 )
16262
16263                 # Match tables in properties we skipped populating should be
16264                 # empty
16265                 || (! $is_property && ! $property->to_create_match_tables)
16266
16267                 # Tables and properties that are expected to have no code
16268                 # points should be empty
16269                 || $suppress_if_empty_warn_if_not
16270             ;
16271
16272             # Set a boolean if this table is the complement of an empty binary
16273             # table
16274             my $is_complement_of_empty_binary =
16275                 $type == $BINARY &&
16276                 (($table == $property->table('Y')
16277                     && $property->table('N')->is_empty)
16278                 || ($table == $property->table('N')
16279                     && $property->table('Y')->is_empty));
16280
16281             if ($table->is_empty) {
16282
16283                 if ($suppress_if_empty_warn_if_not) {
16284                     $table->set_fate($SUPPRESSED,
16285                                      $suppress_if_empty_warn_if_not);
16286                 }
16287
16288                 # Suppress (by skipping them) expected empty tables.
16289                 next TABLE if $expected_empty;
16290
16291                 # And setup to later output a warning for those that aren't
16292                 # known to be allowed to be empty.  Don't do the warning if
16293                 # this table is a child of another one to avoid duplicating
16294                 # the warning that should come from the parent one.
16295                 if (($table == $property || $table->parent == $table)
16296                     && $table->fate != $SUPPRESSED
16297                     && $table->fate != $MAP_PROXIED
16298                     && ! grep { $complete_name =~ /^$_$/ }
16299                                                     @tables_that_may_be_empty)
16300                 {
16301                     push @unhandled_properties, "$table";
16302                 }
16303
16304                 # An empty table is just the complement of everything.
16305                 $table->set_complement($Any) if $table != $property;
16306             }
16307             elsif ($expected_empty) {
16308                 my $because = "";
16309                 if ($suppress_if_empty_warn_if_not) {
16310                     $because = " because $suppress_if_empty_warn_if_not";
16311                 }
16312
16313                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
16314             }
16315
16316             # Some tables should match everything
16317             my $expected_full =
16318                 ($table->fate == $SUPPRESSED)
16319                 ? 0
16320                 : ($is_property)
16321                   ? # All these types of map tables will be full because
16322                     # they will have been populated with defaults
16323                     ($type == $ENUM || $type == $FORCED_BINARY)
16324
16325                   : # A match table should match everything if its method
16326                     # shows it should
16327                     ($table->matches_all
16328
16329                     # The complement of an empty binary table will match
16330                     # everything
16331                     || $is_complement_of_empty_binary
16332                     )
16333             ;
16334
16335             my $count = $table->count;
16336             if ($expected_full) {
16337                 if ($count != $MAX_UNICODE_CODEPOINTS) {
16338                     Carp::my_carp("$table matches only "
16339                     . clarify_number($count)
16340                     . " Unicode code points but should match "
16341                     . clarify_number($MAX_UNICODE_CODEPOINTS)
16342                     . " (off by "
16343                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
16344                     . ").  Proceeding anyway.");
16345                 }
16346
16347                 # Here is expected to be full.  If it is because it is the
16348                 # complement of an (empty) binary table that is to be
16349                 # suppressed, then suppress this one as well.
16350                 if ($is_complement_of_empty_binary) {
16351                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
16352                     my $opposing = $property->table($opposing_name);
16353                     my $opposing_status = $opposing->status;
16354                     if ($opposing_status) {
16355                         $table->set_status($opposing_status,
16356                                            $opposing->status_info);
16357                     }
16358                 }
16359             }
16360             elsif ($count == $MAX_UNICODE_CODEPOINTS
16361                    && ($table == $property || $table->leader == $table)
16362                    && $table->property->status ne $PLACEHOLDER)
16363             {
16364                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
16365             }
16366
16367             if ($table->fate >= $SUPPRESSED) {
16368                 if (! $is_property) {
16369                     my @children = $table->children;
16370                     foreach my $child (@children) {
16371                         if ($child->fate < $SUPPRESSED) {
16372                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
16373                         }
16374                     }
16375                 }
16376                 next TABLE;
16377
16378             }
16379
16380             if (! $is_property) {
16381
16382                 make_ucd_table_pod_entries($table) if $table->property == $perl;
16383
16384                 # Several things need to be done just once for each related
16385                 # group of match tables.  Do them on the parent.
16386                 if ($table->parent == $table) {
16387
16388                     # Add an entry in the pod file for the table; it also does
16389                     # the children.
16390                     make_re_pod_entries($table) if defined $pod_directory;
16391
16392                     # See if the the table matches identical code points with
16393                     # something that has already been output.  In that case,
16394                     # no need to have two files with the same code points in
16395                     # them.  We use the table's hash() method to store these
16396                     # in buckets, so that it is quite likely that if two
16397                     # tables are in the same bucket they will be identical, so
16398                     # don't have to compare tables frequently.  The tables
16399                     # have to have the same status to share a file, so add
16400                     # this to the bucket hash.  (The reason for this latter is
16401                     # that Heavy.pl associates a status with a file.)
16402                     # We don't check tables that are inverses of others, as it
16403                     # would lead to some coding complications, and checking
16404                     # all the regular ones should find everything.
16405                     if ($table->complement == 0) {
16406                         my $hash = $table->hash . ';' . $table->status;
16407
16408                         # Look at each table that is in the same bucket as
16409                         # this one would be.
16410                         foreach my $comparison
16411                                             (@{$match_tables_to_write{$hash}})
16412                         {
16413                             if ($table->matches_identically_to($comparison)) {
16414                                 $table->set_equivalent_to($comparison,
16415                                                                 Related => 0);
16416                                 next TABLE;
16417                             }
16418                         }
16419
16420                         # Here, not equivalent, add this table to the bucket.
16421                         push @{$match_tables_to_write{$hash}}, $table;
16422                     }
16423                 }
16424             }
16425             else {
16426
16427                 # Here is the property itself.
16428                 # Don't write out or make references to the $perl property
16429                 next if $table == $perl;
16430
16431                 make_ucd_table_pod_entries($table);
16432
16433                 # There is a mapping stored of the various synonyms to the
16434                 # standardized name of the property for utf8_heavy.pl.
16435                 # Also, the pod file contains entries of the form:
16436                 # \p{alias: *}         \p{full: *}
16437                 # rather than show every possible combination of things.
16438
16439                 my @property_aliases = $property->aliases;
16440
16441                 my $full_property_name = $property->full_name;
16442                 my $property_name = $property->name;
16443                 my $standard_property_name = standardize($property_name);
16444                 my $standard_property_full_name
16445                                         = standardize($full_property_name);
16446
16447                 # We also create for Unicode::UCD a list of aliases for
16448                 # the property.  The list starts with the property name;
16449                 # then its full name.
16450                 my @property_list;
16451                 my @standard_list;
16452                 if ( $property->fate <= $MAP_PROXIED) {
16453                     @property_list = ($property_name, $full_property_name);
16454                     @standard_list = ($standard_property_name,
16455                                         $standard_property_full_name);
16456                 }
16457
16458                 # For each synonym ...
16459                 for my $i (0 .. @property_aliases - 1)  {
16460                     my $alias = $property_aliases[$i];
16461                     my $alias_name = $alias->name;
16462                     my $alias_standard = standardize($alias_name);
16463
16464
16465                     # Add other aliases to the list of property aliases
16466                     if ($property->fate <= $MAP_PROXIED
16467                         && ! grep { $alias_standard eq $_ } @standard_list)
16468                     {
16469                         push @property_list, $alias_name;
16470                         push @standard_list, $alias_standard;
16471                     }
16472
16473                     # For utf8_heavy, set the mapping of the alias to the
16474                     # property
16475                     if ($type == $STRING) {
16476                         if ($property->fate <= $MAP_PROXIED) {
16477                             $string_property_loose_to_name{$alias_standard}
16478                                             = $standard_property_name;
16479                         }
16480                     }
16481                     else {
16482                         if (exists ($loose_property_name_of{$alias_standard}))
16483                         {
16484                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
16485                         }
16486                         else {
16487                             $loose_property_name_of{$alias_standard}
16488                                                 = $standard_property_name;
16489                         }
16490
16491                         # Now for the re pod entry for this alias.  Skip if not
16492                         # outputting a pod; skip the first one, which is the
16493                         # full name so won't have an entry like: '\p{full: *}
16494                         # \p{full: *}', and skip if don't want an entry for
16495                         # this one.
16496                         next if $i == 0
16497                                 || ! defined $pod_directory
16498                                 || ! $alias->make_re_pod_entry;
16499
16500                         my $rhs = "\\p{$full_property_name: *}";
16501                         if ($property != $perl && $table->perl_extension) {
16502                             $rhs .= ' (Perl extension)';
16503                         }
16504                         push @match_properties,
16505                             format_pod_line($indent_info_column,
16506                                         '\p{' . $alias->name . ': *}',
16507                                         $rhs,
16508                                         $alias->status);
16509                     }
16510                 }
16511
16512                 # The list of all possible names is attached to each alias, so
16513                 # lookup is easy
16514                 if (@property_list) {
16515                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
16516                 }
16517
16518                 if ($property->fate <= $MAP_PROXIED) {
16519
16520                     # Similarly, we create for Unicode::UCD a list of
16521                     # property-value aliases.
16522
16523                     my $property_full_name = $property->full_name;
16524
16525                     # Look at each table in the property...
16526                     foreach my $table ($property->tables) {
16527                         my @values_list;
16528                         my $table_full_name = $table->full_name;
16529                         my $standard_table_full_name
16530                                               = standardize($table_full_name);
16531                         my $table_name = $table->name;
16532                         my $standard_table_name = standardize($table_name);
16533
16534                         # The list starts with the table name and its full
16535                         # name.
16536                         push @values_list, $table_name, $table_full_name;
16537
16538                         # We add to the table each unique alias that isn't
16539                         # discouraged from use.
16540                         foreach my $alias ($table->aliases) {
16541                             next if $alias->status
16542                                  && $alias->status eq $DISCOURAGED;
16543                             my $name = $alias->name;
16544                             my $standard = standardize($name);
16545                             next if $standard eq $standard_table_name;
16546                             next if $standard eq $standard_table_full_name;
16547                             push @values_list, $name;
16548                         }
16549
16550                         # Here @values_list is a list of all the aliases for
16551                         # the table.  That is, all the property-values given
16552                         # by this table.  By agreement with Unicode::UCD,
16553                         # if the name and full name are identical, and there
16554                         # are no other names, drop the duplcate entry to save
16555                         # memory.
16556                         if (@values_list == 2
16557                             && $values_list[0] eq $values_list[1])
16558                         {
16559                             pop @values_list
16560                         }
16561
16562                         # To save memory, unlike the similar list for property
16563                         # aliases above, only the standard forms hve the list.
16564                         # This forces an extra step of converting from input
16565                         # name to standard name, but the savings are
16566                         # considerable.  (There is only marginal savings if we
16567                         # did this with the property aliases.)
16568                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
16569                     }
16570                 }
16571
16572                 # Don't write out a mapping file if not desired.
16573                 next if ! $property->to_output_map;
16574             }
16575
16576             # Here, we know we want to write out the table, but don't do it
16577             # yet because there may be other tables that come along and will
16578             # want to share the file, and the file's comments will change to
16579             # mention them.  So save for later.
16580             push @writables, $table;
16581
16582         } # End of looping through the property and all its tables.
16583     } # End of looping through all properties.
16584
16585     # Now have all the tables that will have files written for them.  Do it.
16586     foreach my $table (@writables) {
16587         my @directory;
16588         my $filename;
16589         my $property = $table->property;
16590         my $is_property = ($table == $property);
16591         if (! $is_property) {
16592
16593             # Match tables for the property go in lib/$subdirectory, which is
16594             # the property's name.  Don't use the standard file name for this,
16595             # as may get an unfamiliar alias
16596             @directory = ($matches_directory, $property->external_name);
16597         }
16598         else {
16599
16600             @directory = $table->directory;
16601             $filename = $table->file;
16602         }
16603
16604         # Use specified filename if available, or default to property's
16605         # shortest name.  We need an 8.3 safe filename (which means "an 8
16606         # safe" filename, since after the dot is only 'pl', which is < 3)
16607         # The 2nd parameter is if the filename shouldn't be changed, and
16608         # it shouldn't iff there is a hard-coded name for this table.
16609         $filename = construct_filename(
16610                                 $filename || $table->external_name,
16611                                 ! $filename,    # mutable if no filename
16612                                 \@directory);
16613
16614         register_file_for_name($table, \@directory, $filename);
16615
16616         # Only need to write one file when shared by more than one
16617         # property
16618         next if ! $is_property
16619                 && ($table->leader != $table || $table->complement != 0);
16620
16621         # Construct a nice comment to add to the file
16622         $table->set_final_comment;
16623
16624         $table->write;
16625     }
16626
16627
16628     # Write out the pod file
16629     make_pod;
16630
16631     # And Heavy.pl, Name.pm, UCD.pl
16632     make_Heavy;
16633     make_Name_pm;
16634     make_UCD;
16635
16636     make_property_test_script() if $make_test_script;
16637     make_normalization_test_script() if $make_norm_test_script;
16638     return;
16639 }
16640
16641 my @white_space_separators = ( # This used only for making the test script.
16642                             "",
16643                             ' ',
16644                             "\t",
16645                             '   '
16646                         );
16647
16648 sub generate_separator($) {
16649     # This used only for making the test script.  It generates the colon or
16650     # equal separator between the property and property value, with random
16651     # white space surrounding the separator
16652
16653     my $lhs = shift;
16654
16655     return "" if $lhs eq "";  # No separator if there's only one (the r) side
16656
16657     # Choose space before and after randomly
16658     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
16659     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
16660
16661     # And return the whole complex, half the time using a colon, half the
16662     # equals
16663     return $spaces_before
16664             . (rand() < 0.5) ? '=' : ':'
16665             . $spaces_after;
16666 }
16667
16668 sub generate_tests($$$$$) {
16669     # This used only for making the test script.  It generates test cases that
16670     # are expected to compile successfully in perl.  Note that the lhs and
16671     # rhs are assumed to already be as randomized as the caller wants.
16672
16673     my $lhs = shift;           # The property: what's to the left of the colon
16674                                #  or equals separator
16675     my $rhs = shift;           # The property value; what's to the right
16676     my $valid_code = shift;    # A code point that's known to be in the
16677                                # table given by lhs=rhs; undef if table is
16678                                # empty
16679     my $invalid_code = shift;  # A code point known to not be in the table;
16680                                # undef if the table is all code points
16681     my $warning = shift;
16682
16683     # Get the colon or equal
16684     my $separator = generate_separator($lhs);
16685
16686     # The whole 'property=value'
16687     my $name = "$lhs$separator$rhs";
16688
16689     my @output;
16690     # Create a complete set of tests, with complements.
16691     if (defined $valid_code) {
16692         push @output, <<"EOC"
16693 Expect(1, $valid_code, '\\p{$name}', $warning);
16694 Expect(0, $valid_code, '\\p{^$name}', $warning);
16695 Expect(0, $valid_code, '\\P{$name}', $warning);
16696 Expect(1, $valid_code, '\\P{^$name}', $warning);
16697 EOC
16698     }
16699     if (defined $invalid_code) {
16700         push @output, <<"EOC"
16701 Expect(0, $invalid_code, '\\p{$name}', $warning);
16702 Expect(1, $invalid_code, '\\p{^$name}', $warning);
16703 Expect(1, $invalid_code, '\\P{$name}', $warning);
16704 Expect(0, $invalid_code, '\\P{^$name}', $warning);
16705 EOC
16706     }
16707     return @output;
16708 }
16709
16710 sub generate_error($$$) {
16711     # This used only for making the test script.  It generates test cases that
16712     # are expected to not only not match, but to be syntax or similar errors
16713
16714     my $lhs = shift;                # The property: what's to the left of the
16715                                     # colon or equals separator
16716     my $rhs = shift;                # The property value; what's to the right
16717     my $already_in_error = shift;   # Boolean; if true it's known that the
16718                                 # unmodified lhs and rhs will cause an error.
16719                                 # This routine should not force another one
16720     # Get the colon or equal
16721     my $separator = generate_separator($lhs);
16722
16723     # Since this is an error only, don't bother to randomly decide whether to
16724     # put the error on the left or right side; and assume that the rhs is
16725     # loosely matched, again for convenience rather than rigor.
16726     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
16727
16728     my $property = $lhs . $separator . $rhs;
16729
16730     return <<"EOC";
16731 Error('\\p{$property}');
16732 Error('\\P{$property}');
16733 EOC
16734 }
16735
16736 # These are used only for making the test script
16737 # XXX Maybe should also have a bad strict seps, which includes underscore.
16738
16739 my @good_loose_seps = (
16740             " ",
16741             "-",
16742             "\t",
16743             "",
16744             "_",
16745            );
16746 my @bad_loose_seps = (
16747            "/a/",
16748            ':=',
16749           );
16750
16751 sub randomize_stricter_name {
16752     # This used only for making the test script.  Take the input name and
16753     # return a randomized, but valid version of it under the stricter matching
16754     # rules.
16755
16756     my $name = shift;
16757     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16758
16759     # If the name looks like a number (integer, floating, or rational), do
16760     # some extra work
16761     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
16762         my $sign = $1;
16763         my $number = $2;
16764         my $separator = $3;
16765
16766         # If there isn't a sign, part of the time add a plus
16767         # Note: Not testing having any denominator having a minus sign
16768         if (! $sign) {
16769             $sign = '+' if rand() <= .3;
16770         }
16771
16772         # And add 0 or more leading zeros.
16773         $name = $sign . ('0' x int rand(10)) . $number;
16774
16775         if (defined $separator) {
16776             my $extra_zeros = '0' x int rand(10);
16777
16778             if ($separator eq '.') {
16779
16780                 # Similarly, add 0 or more trailing zeros after a decimal
16781                 # point
16782                 $name .= $extra_zeros;
16783             }
16784             else {
16785
16786                 # Or, leading zeros before the denominator
16787                 $name =~ s,/,/$extra_zeros,;
16788             }
16789         }
16790     }
16791
16792     # For legibility of the test, only change the case of whole sections at a
16793     # time.  To do this, first split into sections.  The split returns the
16794     # delimiters
16795     my @sections;
16796     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
16797         trace $section if main::DEBUG && $to_trace;
16798
16799         if (length $section > 1 && $section !~ /\D/) {
16800
16801             # If the section is a sequence of digits, about half the time
16802             # randomly add underscores between some of them.
16803             if (rand() > .5) {
16804
16805                 # Figure out how many underscores to add.  max is 1 less than
16806                 # the number of digits.  (But add 1 at the end to make sure
16807                 # result isn't 0, and compensate earlier by subtracting 2
16808                 # instead of 1)
16809                 my $num_underscores = int rand(length($section) - 2) + 1;
16810
16811                 # And add them evenly throughout, for convenience, not rigor
16812                 use integer;
16813                 my $spacing = (length($section) - 1)/ $num_underscores;
16814                 my $temp = $section;
16815                 $section = "";
16816                 for my $i (1 .. $num_underscores) {
16817                     $section .= substr($temp, 0, $spacing, "") . '_';
16818                 }
16819                 $section .= $temp;
16820             }
16821             push @sections, $section;
16822         }
16823         else {
16824
16825             # Here not a sequence of digits.  Change the case of the section
16826             # randomly
16827             my $switch = int rand(4);
16828             if ($switch == 0) {
16829                 push @sections, uc $section;
16830             }
16831             elsif ($switch == 1) {
16832                 push @sections, lc $section;
16833             }
16834             elsif ($switch == 2) {
16835                 push @sections, ucfirst $section;
16836             }
16837             else {
16838                 push @sections, $section;
16839             }
16840         }
16841     }
16842     trace "returning", join "", @sections if main::DEBUG && $to_trace;
16843     return join "", @sections;
16844 }
16845
16846 sub randomize_loose_name($;$) {
16847     # This used only for making the test script
16848
16849     my $name = shift;
16850     my $want_error = shift;  # if true, make an error
16851     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16852
16853     $name = randomize_stricter_name($name);
16854
16855     my @parts;
16856     push @parts, $good_loose_seps[rand(@good_loose_seps)];
16857
16858     # Preserve trailing ones for the sake of not stripping the underscore from
16859     # 'L_'
16860     for my $part (split /[-\s_]+ (?= . )/, $name) {
16861         if (@parts) {
16862             if ($want_error and rand() < 0.3) {
16863                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
16864                 $want_error = 0;
16865             }
16866             else {
16867                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
16868             }
16869         }
16870         push @parts, $part;
16871     }
16872     my $new = join("", @parts);
16873     trace "$name => $new" if main::DEBUG && $to_trace;
16874
16875     if ($want_error) {
16876         if (rand() >= 0.5) {
16877             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
16878         }
16879         else {
16880             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
16881         }
16882     }
16883     return $new;
16884 }
16885
16886 # Used to make sure don't generate duplicate test cases.
16887 my %test_generated;
16888
16889 sub make_property_test_script() {
16890     # This used only for making the test script
16891     # this written directly -- it's huge.
16892
16893     print "Making test script\n" if $verbosity >= $PROGRESS;
16894
16895     # This uses randomness to test different possibilities without testing all
16896     # possibilities.  To ensure repeatability, set the seed to 0.  But if
16897     # tests are added, it will perturb all later ones in the .t file
16898     srand 0;
16899
16900     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
16901
16902     # Keep going down an order of magnitude
16903     # until find that adding this quantity to
16904     # 1 remains 1; but put an upper limit on
16905     # this so in case this algorithm doesn't
16906     # work properly on some platform, that we
16907     # won't loop forever.
16908     my $digits = 0;
16909     my $min_floating_slop = 1;
16910     while (1+ $min_floating_slop != 1
16911             && $digits++ < 50)
16912     {
16913         my $next = $min_floating_slop / 10;
16914         last if $next == 0; # If underflows,
16915                             # use previous one
16916         $min_floating_slop = $next;
16917     }
16918
16919     # It doesn't matter whether the elements of this array contain single lines
16920     # or multiple lines. main::write doesn't count the lines.
16921     my @output;
16922
16923     foreach my $property (property_ref('*')) {
16924         foreach my $table ($property->tables) {
16925
16926             # Find code points that match, and don't match this table.
16927             my $valid = $table->get_valid_code_point;
16928             my $invalid = $table->get_invalid_code_point;
16929             my $warning = ($table->status eq $DEPRECATED)
16930                             ? "'deprecated'"
16931                             : '""';
16932
16933             # Test each possible combination of the property's aliases with
16934             # the table's.  If this gets to be too many, could do what is done
16935             # in the set_final_comment() for Tables
16936             my @table_aliases = $table->aliases;
16937             my @property_aliases = $table->property->aliases;
16938
16939             # Every property can be optionally be prefixed by 'Is_', so test
16940             # that those work, by creating such a new alias for each
16941             # pre-existing one.
16942             push @property_aliases, map { Alias->new("Is_" . $_->name,
16943                                                     $_->loose_match,
16944                                                     $_->make_re_pod_entry,
16945                                                     $_->ok_as_filename,
16946                                                     $_->status,
16947                                                     $_->ucd,
16948                                                     )
16949                                          } @property_aliases;
16950             my $max = max(scalar @table_aliases, scalar @property_aliases);
16951             for my $j (0 .. $max - 1) {
16952
16953                 # The current alias for property is the next one on the list,
16954                 # or if beyond the end, start over.  Similarly for table
16955                 my $property_name
16956                             = $property_aliases[$j % @property_aliases]->name;
16957
16958                 $property_name = "" if $table->property == $perl;
16959                 my $table_alias = $table_aliases[$j % @table_aliases];
16960                 my $table_name = $table_alias->name;
16961                 my $loose_match = $table_alias->loose_match;
16962
16963                 # If the table doesn't have a file, any test for it is
16964                 # already guaranteed to be in error
16965                 my $already_error = ! $table->file_path;
16966
16967                 # Generate error cases for this alias.
16968                 push @output, generate_error($property_name,
16969                                              $table_name,
16970                                              $already_error);
16971
16972                 # If the table is guaranteed to always generate an error,
16973                 # quit now without generating success cases.
16974                 next if $already_error;
16975
16976                 # Now for the success cases.
16977                 my $random;
16978                 if ($loose_match) {
16979
16980                     # For loose matching, create an extra test case for the
16981                     # standard name.
16982                     my $standard = standardize($table_name);
16983
16984                     # $test_name should be a unique combination for each test
16985                     # case; used just to avoid duplicate tests
16986                     my $test_name = "$property_name=$standard";
16987
16988                     # Don't output duplicate test cases.
16989                     if (! exists $test_generated{$test_name}) {
16990                         $test_generated{$test_name} = 1;
16991                         push @output, generate_tests($property_name,
16992                                                      $standard,
16993                                                      $valid,
16994                                                      $invalid,
16995                                                      $warning,
16996                                                  );
16997                     }
16998                     $random = randomize_loose_name($table_name)
16999                 }
17000                 else { # Stricter match
17001                     $random = randomize_stricter_name($table_name);
17002                 }
17003
17004                 # Now for the main test case for this alias.
17005                 my $test_name = "$property_name=$random";
17006                 if (! exists $test_generated{$test_name}) {
17007                     $test_generated{$test_name} = 1;
17008                     push @output, generate_tests($property_name,
17009                                                  $random,
17010                                                  $valid,
17011                                                  $invalid,
17012                                                  $warning,
17013                                              );
17014
17015                     # If the name is a rational number, add tests for the
17016                     # floating point equivalent.
17017                     if ($table_name =~ qr{/}) {
17018
17019                         # Calculate the float, and find just the fraction.
17020                         my $float = eval $table_name;
17021                         my ($whole, $fraction)
17022                                             = $float =~ / (.*) \. (.*) /x;
17023
17024                         # Starting with one digit after the decimal point,
17025                         # create a test for each possible precision (number of
17026                         # digits past the decimal point) until well beyond the
17027                         # native number found on this machine.  (If we started
17028                         # with 0 digits, it would be an integer, which could
17029                         # well match an unrelated table)
17030                         PLACE:
17031                         for my $i (1 .. $min_floating_slop + 3) {
17032                             my $table_name = sprintf("%.*f", $i, $float);
17033                             if ($i < $MIN_FRACTION_LENGTH) {
17034
17035                                 # If the test case has fewer digits than the
17036                                 # minimum acceptable precision, it shouldn't
17037                                 # succeed, so we expect an error for it.
17038                                 # E.g., 2/3 = .7 at one decimal point, and we
17039                                 # shouldn't say it matches .7.  We should make
17040                                 # it be .667 at least before agreeing that the
17041                                 # intent was to match 2/3.  But at the
17042                                 # less-than- acceptable level of precision, it
17043                                 # might actually match an unrelated number.
17044                                 # So don't generate a test case if this
17045                                 # conflating is possible.  In our example, we
17046                                 # don't want 2/3 matching 7/10, if there is
17047                                 # a 7/10 code point.
17048                                 for my $existing
17049                                         (keys %nv_floating_to_rational)
17050                                 {
17051                                     next PLACE
17052                                         if abs($table_name - $existing)
17053                                                 < $MAX_FLOATING_SLOP;
17054                                 }
17055                                 push @output, generate_error($property_name,
17056                                                              $table_name,
17057                                                              1   # 1 => already an error
17058                                               );
17059                             }
17060                             else {
17061
17062                                 # Here the number of digits exceeds the
17063                                 # minimum we think is needed.  So generate a
17064                                 # success test case for it.
17065                                 push @output, generate_tests($property_name,
17066                                                              $table_name,
17067                                                              $valid,
17068                                                              $invalid,
17069                                                              $warning,
17070                                              );
17071                             }
17072                         }
17073                     }
17074                 }
17075             }
17076         }
17077     }
17078
17079     &write($t_path,
17080            0,           # Not utf8;
17081            [<DATA>,
17082             @output,
17083             (map {"Test_X('$_');\n"} @backslash_X_tests),
17084             "Finished();\n"]);
17085     return;
17086 }
17087
17088 sub make_normalization_test_script() {
17089     print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17090
17091     my $n_path = 'TestNorm.pl';
17092
17093     unshift @normalization_tests, <<'END';
17094 use utf8;
17095 use Test::More;
17096
17097 sub ord_string {    # Convert packed ords to printable string
17098     use charnames ();
17099     return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17100                                                 unpack "U*", shift) .  "'";
17101     #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
17102 }
17103
17104 sub Test_N {
17105     my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17106     my $display_source = ord_string($source);
17107     my $display_nfc = ord_string($nfc);
17108     my $display_nfd = ord_string($nfd);
17109     my $display_nfkc = ord_string($nfkc);
17110     my $display_nfkd = ord_string($nfkd);
17111
17112     use Unicode::Normalize;
17113     #    NFC
17114     #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
17115     #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
17116     #
17117     #    NFD
17118     #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
17119     #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
17120     #
17121     #    NFKC
17122     #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17123     #      toNFKC(nfkc) == toNFKC(nfkd)
17124     #
17125     #    NFKD
17126     #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17127     #      toNFKD(nfkc) == toNFKD(nfkd)
17128
17129     is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17130     is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17131     is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17132     is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17133     is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17134
17135     is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17136     is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17137     is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17138     is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17139     is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17140
17141     is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17142     is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17143     is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17144     is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17145     is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17146
17147     is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17148     is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17149     is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17150     is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17151     is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17152 }
17153 END
17154
17155     &write($n_path,
17156            1,           # Is utf8;
17157            [
17158             @normalization_tests,
17159             'done_testing();'
17160             ]);
17161     return;
17162 }
17163
17164 # This is a list of the input files and how to handle them.  The files are
17165 # processed in their order in this list.  Some reordering is possible if
17166 # desired, but the v0 files should be first, and the extracted before the
17167 # others except DAge.txt (as data in an extracted file can be over-ridden by
17168 # the non-extracted.  Some other files depend on data derived from an earlier
17169 # file, like UnicodeData requires data from Jamo, and the case changing and
17170 # folding requires data from Unicode.  Mostly, it is safest to order by first
17171 # version releases in (except the Jamo).  DAge.txt is read before the
17172 # extracted ones because of the rarely used feature $compare_versions.  In the
17173 # unlikely event that there were ever an extracted file that contained the Age
17174 # property information, it would have to go in front of DAge.
17175 #
17176 # The version strings allow the program to know whether to expect a file or
17177 # not, but if a file exists in the directory, it will be processed, even if it
17178 # is in a version earlier than expected, so you can copy files from a later
17179 # release into an earlier release's directory.
17180 my @input_file_objects = (
17181     Input_file->new('PropertyAliases.txt', v0,
17182                     Handler => \&process_PropertyAliases,
17183                     ),
17184     Input_file->new(undef, v0,  # No file associated with this
17185                     Progress_Message => 'Finishing property setup',
17186                     Handler => \&finish_property_setup,
17187                     ),
17188     Input_file->new('PropValueAliases.txt', v0,
17189                      Handler => \&process_PropValueAliases,
17190                      Has_Missings_Defaults => $NOT_IGNORED,
17191                      ),
17192     Input_file->new('DAge.txt', v3.2.0,
17193                     Has_Missings_Defaults => $NOT_IGNORED,
17194                     Property => 'Age'
17195                     ),
17196     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17197                     Property => 'General_Category',
17198                     ),
17199     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17200                     Property => 'Canonical_Combining_Class',
17201                     Has_Missings_Defaults => $NOT_IGNORED,
17202                     ),
17203     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17204                     Property => 'Numeric_Type',
17205                     Has_Missings_Defaults => $NOT_IGNORED,
17206                     ),
17207     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17208                     Property => 'East_Asian_Width',
17209                     Has_Missings_Defaults => $NOT_IGNORED,
17210                     ),
17211     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17212                     Property => 'Line_Break',
17213                     Has_Missings_Defaults => $NOT_IGNORED,
17214                     ),
17215     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17216                     Property => 'Bidi_Class',
17217                     Has_Missings_Defaults => $NOT_IGNORED,
17218                     ),
17219     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17220                     Property => 'Decomposition_Type',
17221                     Has_Missings_Defaults => $NOT_IGNORED,
17222                     ),
17223     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17224     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17225                     Property => 'Numeric_Value',
17226                     Each_Line_Handler => \&filter_numeric_value_line,
17227                     Has_Missings_Defaults => $NOT_IGNORED,
17228                     ),
17229     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17230                     Property => 'Joining_Group',
17231                     Has_Missings_Defaults => $NOT_IGNORED,
17232                     ),
17233
17234     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17235                     Property => 'Joining_Type',
17236                     Has_Missings_Defaults => $NOT_IGNORED,
17237                     ),
17238     Input_file->new('Jamo.txt', v2.0.0,
17239                     Property => 'Jamo_Short_Name',
17240                     Each_Line_Handler => \&filter_jamo_line,
17241                     ),
17242     Input_file->new('UnicodeData.txt', v1.1.5,
17243                     Pre_Handler => \&setup_UnicodeData,
17244
17245                     # We clean up this file for some early versions.
17246                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
17247                                             ? \&filter_v1_ucd
17248                                             : ($v_version eq v2.1.5)
17249                                                 ? \&filter_v2_1_5_ucd
17250
17251                                                 # And for 5.14 Perls with 6.0,
17252                                                 # have to also make changes
17253                                                 : ($v_version ge v6.0.0
17254                                                    && $^V lt v5.17.0)
17255                                                     ? \&filter_v6_ucd
17256                                                     : undef),
17257
17258                                             # Early versions did not have the
17259                                             # proper Unicode_1 names for the
17260                                             # controls
17261                                             (($v_version lt v3.0.0)
17262                                             ? \&filter_early_U1_names
17263                                             : undef),
17264
17265                                             # Early versions did not correctly
17266                                             # use the later method for giving
17267                                             # decimal digit values
17268                                             (($v_version le v3.2.0)
17269                                             ? \&filter_bad_Nd_ucd
17270                                             : undef),
17271
17272                                             # And the main filter
17273                                             \&filter_UnicodeData_line,
17274                                          ],
17275                     EOF_Handler => \&EOF_UnicodeData,
17276                     ),
17277     Input_file->new('ArabicShaping.txt', v2.0.0,
17278                     Each_Line_Handler =>
17279                         [ ($v_version lt 4.1.0)
17280                                     ? \&filter_old_style_arabic_shaping
17281                                     : undef,
17282                         \&filter_arabic_shaping_line,
17283                         ],
17284                     Has_Missings_Defaults => $NOT_IGNORED,
17285                     ),
17286     Input_file->new('Blocks.txt', v2.0.0,
17287                     Property => 'Block',
17288                     Has_Missings_Defaults => $NOT_IGNORED,
17289                     Each_Line_Handler => \&filter_blocks_lines
17290                     ),
17291     Input_file->new('PropList.txt', v2.0.0,
17292                     Each_Line_Handler => (($v_version lt v3.1.0)
17293                                             ? \&filter_old_style_proplist
17294                                             : undef),
17295                     ),
17296     Input_file->new('Unihan.txt', v2.0.0,
17297                     Pre_Handler => \&setup_unihan,
17298                     Optional => 1,
17299                     Each_Line_Handler => \&filter_unihan_line,
17300                         ),
17301     Input_file->new('SpecialCasing.txt', v2.1.8,
17302                     Each_Line_Handler => ($v_version eq 2.1.8)
17303                                          ? \&filter_2_1_8_special_casing_line
17304                                          : \&filter_special_casing_line,
17305                     Pre_Handler => \&setup_special_casing,
17306                     Has_Missings_Defaults => $IGNORED,
17307                     ),
17308     Input_file->new(
17309                     'LineBreak.txt', v3.0.0,
17310                     Has_Missings_Defaults => $NOT_IGNORED,
17311                     Property => 'Line_Break',
17312                     # Early versions had problematic syntax
17313                     Each_Line_Handler => (($v_version lt v3.1.0)
17314                                         ? \&filter_early_ea_lb
17315                                         : undef),
17316                     ),
17317     Input_file->new('EastAsianWidth.txt', v3.0.0,
17318                     Property => 'East_Asian_Width',
17319                     Has_Missings_Defaults => $NOT_IGNORED,
17320                     # Early versions had problematic syntax
17321                     Each_Line_Handler => (($v_version lt v3.1.0)
17322                                         ? \&filter_early_ea_lb
17323                                         : undef),
17324                     ),
17325     Input_file->new('CompositionExclusions.txt', v3.0.0,
17326                     Property => 'Composition_Exclusion',
17327                     ),
17328     Input_file->new('BidiMirroring.txt', v3.0.1,
17329                     Has_Missings_Defaults => ($v_version lt v6.2.0)
17330                                              ? $NO_DEFAULTS
17331                                              # Is <none> which doesn't mean
17332                                              # anything to us, we will use the
17333                                              # null string
17334                                              : $IGNORED,
17335                     Property => 'Bidi_Mirroring_Glyph',
17336                     ),
17337     Input_file->new("NormTest.txt", v3.0.0,
17338                      Handler => \&process_NormalizationsTest,
17339                      Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
17340                     ),
17341     Input_file->new('CaseFolding.txt', v3.0.1,
17342                     Pre_Handler => \&setup_case_folding,
17343                     Each_Line_Handler =>
17344                         [ ($v_version lt v3.1.0)
17345                                  ? \&filter_old_style_case_folding
17346                                  : undef,
17347                            \&filter_case_folding_line
17348                         ],
17349                     Has_Missings_Defaults => $IGNORED,
17350                     ),
17351     Input_file->new('DCoreProperties.txt', v3.1.0,
17352                     # 5.2 changed this file
17353                     Has_Missings_Defaults => (($v_version ge v5.2.0)
17354                                             ? $NOT_IGNORED
17355                                             : $NO_DEFAULTS),
17356                     ),
17357     Input_file->new('Scripts.txt', v3.1.0,
17358                     Property => 'Script',
17359                     Has_Missings_Defaults => $NOT_IGNORED,
17360                     ),
17361     Input_file->new('DNormalizationProps.txt', v3.1.0,
17362                     Has_Missings_Defaults => $NOT_IGNORED,
17363                     Each_Line_Handler => (($v_version lt v4.0.1)
17364                                       ? \&filter_old_style_normalization_lines
17365                                       : undef),
17366                     ),
17367     Input_file->new('HangulSyllableType.txt', v0,
17368                     Has_Missings_Defaults => $NOT_IGNORED,
17369                     Property => 'Hangul_Syllable_Type',
17370                     Pre_Handler => ($v_version lt v4.0.0)
17371                                    ? \&generate_hst
17372                                    : undef,
17373                     ),
17374     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
17375                     Property => 'Word_Break',
17376                     Has_Missings_Defaults => $NOT_IGNORED,
17377                     ),
17378     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
17379                     Property => 'Grapheme_Cluster_Break',
17380                     Has_Missings_Defaults => $NOT_IGNORED,
17381                     Pre_Handler => ($v_version lt v4.1.0)
17382                                    ? \&generate_GCB
17383                                    : undef,
17384                     ),
17385     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
17386                     Handler => \&process_GCB_test,
17387                     ),
17388     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
17389                     Skip => 'Validation Tests',
17390                     ),
17391     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
17392                     Skip => 'Validation Tests',
17393                     ),
17394     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
17395                     Skip => 'Validation Tests',
17396                     ),
17397     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
17398                     Property => 'Sentence_Break',
17399                     Has_Missings_Defaults => $NOT_IGNORED,
17400                     ),
17401     Input_file->new('NamedSequences.txt', v4.1.0,
17402                     Handler => \&process_NamedSequences
17403                     ),
17404     Input_file->new('NameAliases.txt', v0,
17405                     Property => 'Name_Alias',
17406                     Pre_Handler => ($v_version le v6.0.0)
17407                                    ? \&setup_early_name_alias
17408                                    : undef,
17409                     Each_Line_Handler => ($v_version le v6.0.0)
17410                                    ? \&filter_early_version_name_alias_line
17411                                    : \&filter_later_version_name_alias_line,
17412                     ),
17413     Input_file->new("BidiTest.txt", v5.2.0,
17414                     Skip => 'Validation Tests',
17415                     ),
17416     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
17417                     Optional => 1,
17418                     Each_Line_Handler => \&filter_unihan_line,
17419                     ),
17420     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
17421                     Optional => 1,
17422                     Each_Line_Handler => \&filter_unihan_line,
17423                     ),
17424     Input_file->new('UnihanIRGSources.txt', v5.2.0,
17425                     Optional => 1,
17426                     Pre_Handler => \&setup_unihan,
17427                     Each_Line_Handler => \&filter_unihan_line,
17428                     ),
17429     Input_file->new('UnihanNumericValues.txt', v5.2.0,
17430                     Optional => 1,
17431                     Each_Line_Handler => \&filter_unihan_line,
17432                     ),
17433     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
17434                     Optional => 1,
17435                     Each_Line_Handler => \&filter_unihan_line,
17436                     ),
17437     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
17438                     Optional => 1,
17439                     Each_Line_Handler => \&filter_unihan_line,
17440                     ),
17441     Input_file->new('UnihanReadings.txt', v5.2.0,
17442                     Optional => 1,
17443                     Each_Line_Handler => \&filter_unihan_line,
17444                     ),
17445     Input_file->new('UnihanVariants.txt', v5.2.0,
17446                     Optional => 1,
17447                     Each_Line_Handler => \&filter_unihan_line,
17448                     ),
17449     Input_file->new('ScriptExtensions.txt', v6.0.0,
17450                     Property => 'Script_Extensions',
17451                     Pre_Handler => \&setup_script_extensions,
17452                     Each_Line_Handler => \&filter_script_extensions_line,
17453                     Has_Missings_Defaults => (($v_version le v6.0.0)
17454                                             ? $NO_DEFAULTS
17455                                             : $IGNORED),
17456                     ),
17457     # The two Indic files are actually available starting in v6.0.0, but their
17458     # property values are missing from PropValueAliases.txt in that release,
17459     # so that further work would have to be done to get them to work properly
17460     # for that release.
17461     Input_file->new('IndicMatraCategory.txt', v6.1.0,
17462                     Property => 'Indic_Matra_Category',
17463                     Has_Missings_Defaults => $NOT_IGNORED,
17464                     Skip => "Provisional; for the analysis and processing of Indic scripts",
17465                     ),
17466     Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
17467                     Property => 'Indic_Syllabic_Category',
17468                     Has_Missings_Defaults => $NOT_IGNORED,
17469                     Skip => "Provisional; for the analysis and processing of Indic scripts",
17470                     ),
17471 );
17472
17473 # End of all the preliminaries.
17474 # Do it...
17475
17476 if ($compare_versions) {
17477     Carp::my_carp(<<END
17478 Warning.  \$compare_versions is set.  Output is not suitable for production
17479 END
17480     );
17481 }
17482
17483 # Put into %potential_files a list of all the files in the directory structure
17484 # that could be inputs to this program, excluding those that we should ignore.
17485 # Use absolute file names because it makes it easier across machine types.
17486 my @ignored_files_full_names = map { File::Spec->rel2abs(
17487                                      internal_file_to_platform($_))
17488                                 } keys %ignored_files;
17489 File::Find::find({
17490     wanted=>sub {
17491         return unless /\.txt$/i;  # Some platforms change the name's case
17492         my $full = lc(File::Spec->rel2abs($_));
17493         $potential_files{$full} = 1
17494                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
17495         return;
17496     }
17497 }, File::Spec->curdir());
17498
17499 my @mktables_list_output_files;
17500 my $old_start_time = 0;
17501
17502 if (! -e $file_list) {
17503     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
17504     $write_unchanged_files = 1;
17505 } elsif ($write_unchanged_files) {
17506     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
17507 }
17508 else {
17509     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
17510     my $file_handle;
17511     if (! open $file_handle, "<", $file_list) {
17512         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
17513         $glob_list = 1;
17514     }
17515     else {
17516         my @input;
17517
17518         # Read and parse mktables.lst, placing the results from the first part
17519         # into @input, and the second part into @mktables_list_output_files
17520         for my $list ( \@input, \@mktables_list_output_files ) {
17521             while (<$file_handle>) {
17522                 s/^ \s+ | \s+ $//xg;
17523                 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
17524                     $old_start_time = $1;
17525                 }
17526                 next if /^ \s* (?: \# .* )? $/x;
17527                 last if /^ =+ $/x;
17528                 my ( $file ) = split /\t/;
17529                 push @$list, $file;
17530             }
17531             @$list = uniques(@$list);
17532             next;
17533         }
17534
17535         # Look through all the input files
17536         foreach my $input (@input) {
17537             next if $input eq 'version'; # Already have checked this.
17538
17539             # Ignore if doesn't exist.  The checking about whether we care or
17540             # not is done via the Input_file object.
17541             next if ! file_exists($input);
17542
17543             # The paths are stored with relative names, and with '/' as the
17544             # delimiter; convert to absolute on this machine
17545             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
17546             $potential_files{lc $full} = 1
17547                 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
17548         }
17549     }
17550
17551     close $file_handle;
17552 }
17553
17554 if ($glob_list) {
17555
17556     # Here wants to process all .txt files in the directory structure.
17557     # Convert them to full path names.  They are stored in the platform's
17558     # relative style
17559     my @known_files;
17560     foreach my $object (@input_file_objects) {
17561         my $file = $object->file;
17562         next unless defined $file;
17563         push @known_files, File::Spec->rel2abs($file);
17564     }
17565
17566     my @unknown_input_files;
17567     foreach my $file (keys %potential_files) {  # The keys are stored in lc
17568         next if grep { $file eq lc($_) } @known_files;
17569
17570         # Here, the file is unknown to us.  Get relative path name
17571         $file = File::Spec->abs2rel($file);
17572         push @unknown_input_files, $file;
17573
17574         # What will happen is we create a data structure for it, and add it to
17575         # the list of input files to process.  First get the subdirectories
17576         # into an array
17577         my (undef, $directories, undef) = File::Spec->splitpath($file);
17578         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
17579         my @directories = File::Spec->splitdir($directories);
17580
17581         # If the file isn't extracted (meaning none of the directories is the
17582         # extracted one), just add it to the end of the list of inputs.
17583         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
17584             push @input_file_objects, Input_file->new($file, v0);
17585         }
17586         else {
17587
17588             # Here, the file is extracted.  It needs to go ahead of most other
17589             # processing.  Search for the first input file that isn't a
17590             # special required property (that is, find one whose first_release
17591             # is non-0), and isn't extracted.  Also, the Age property file is
17592             # processed before the extracted ones, just in case
17593             # $compare_versions is set.
17594             for (my $i = 0; $i < @input_file_objects; $i++) {
17595                 if ($input_file_objects[$i]->first_released ne v0
17596                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
17597                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
17598                 {
17599                     splice @input_file_objects, $i, 0,
17600                                                 Input_file->new($file, v0);
17601                     last;
17602                 }
17603             }
17604
17605         }
17606     }
17607     if (@unknown_input_files) {
17608         print STDERR simple_fold(join_lines(<<END
17609
17610 The following files are unknown as to how to handle.  Assuming they are
17611 typical property files.  You'll know by later error messages if it worked or
17612 not:
17613 END
17614         ) . " " . join(", ", @unknown_input_files) . "\n\n");
17615     }
17616 } # End of looking through directory structure for more .txt files.
17617
17618 # Create the list of input files from the objects we have defined, plus
17619 # version
17620 my @input_files = 'version';
17621 foreach my $object (@input_file_objects) {
17622     my $file = $object->file;
17623     next if ! defined $file;    # Not all objects have files
17624     next if $object->optional && ! -e $file;
17625     push @input_files,  $file;
17626 }
17627
17628 if ( $verbosity >= $VERBOSE ) {
17629     print "Expecting ".scalar( @input_files )." input files. ",
17630          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
17631 }
17632
17633 # We set $most_recent to be the most recently changed input file, including
17634 # this program itself (done much earlier in this file)
17635 foreach my $in (@input_files) {
17636     next unless -e $in;        # Keep going even if missing a file
17637     my $mod_time = (stat $in)[9];
17638     $most_recent = $mod_time if $mod_time > $most_recent;
17639
17640     # See that the input files have distinct names, to warn someone if they
17641     # are adding a new one
17642     if ($make_list) {
17643         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
17644         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
17645         my @directories = File::Spec->splitdir($directories);
17646         my $base = $file =~ s/\.txt$//;
17647         construct_filename($file, 'mutable', \@directories);
17648     }
17649 }
17650
17651 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
17652               || ! scalar @mktables_list_output_files  # or if no outputs known
17653               || $old_start_time < $most_recent;       # or out-of-date
17654
17655 # Now we check to see if any output files are older than youngest, if
17656 # they are, we need to continue on, otherwise we can presumably bail.
17657 if (! $rebuild) {
17658     foreach my $out (@mktables_list_output_files) {
17659         if ( ! file_exists($out)) {
17660             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
17661             $rebuild = 1;
17662             last;
17663          }
17664         #local $to_trace = 1 if main::DEBUG;
17665         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
17666         if ( (stat $out)[9] <= $most_recent ) {
17667             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
17668             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
17669             $rebuild = 1;
17670             last;
17671         }
17672     }
17673 }
17674 if (! $rebuild) {
17675     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
17676     exit(0);
17677 }
17678 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
17679
17680 # Ready to do the major processing.  First create the perl pseudo-property.
17681 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
17682
17683 # Process each input file
17684 foreach my $file (@input_file_objects) {
17685     $file->run;
17686 }
17687
17688 # Finish the table generation.
17689
17690 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
17691 finish_Unicode();
17692
17693 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
17694 compile_perl();
17695
17696 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
17697 add_perl_synonyms();
17698
17699 print "Writing tables\n" if $verbosity >= $PROGRESS;
17700 write_all_tables();
17701
17702 # Write mktables.lst
17703 if ( $file_list and $make_list ) {
17704
17705     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
17706     foreach my $file (@input_files, @files_actually_output) {
17707         my (undef, $directories, $file) = File::Spec->splitpath($file);
17708         my @directories = File::Spec->splitdir($directories);
17709         $file = join '/', @directories, $file;
17710     }
17711
17712     my $ofh;
17713     if (! open $ofh,">",$file_list) {
17714         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
17715         return
17716     }
17717     else {
17718         my $localtime = localtime $start_time;
17719         print $ofh <<"END";
17720 #
17721 # $file_list -- File list for $0.
17722 #
17723 #   Autogenerated starting on $start_time ($localtime)
17724 #
17725 # - First section is input files
17726 #   ($0 itself is not listed but is automatically considered an input)
17727 # - Section separator is /^=+\$/
17728 # - Second section is a list of output files.
17729 # - Lines matching /^\\s*#/ are treated as comments
17730 #   which along with blank lines are ignored.
17731 #
17732
17733 # Input files:
17734
17735 END
17736         print $ofh "$_\n" for sort(@input_files);
17737         print $ofh "\n=================================\n# Output files:\n\n";
17738         print $ofh "$_\n" for sort @files_actually_output;
17739         print $ofh "\n# ",scalar(@input_files)," input files\n",
17740                 "# ",scalar(@files_actually_output)+1," output files\n\n",
17741                 "# End list\n";
17742         close $ofh
17743             or Carp::my_carp("Failed to close $ofh: $!");
17744
17745         print "Filelist has ",scalar(@input_files)," input files and ",
17746             scalar(@files_actually_output)+1," output files\n"
17747             if $verbosity >= $VERBOSE;
17748     }
17749 }
17750
17751 # Output these warnings unless -q explicitly specified.
17752 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
17753     if (@unhandled_properties) {
17754         print "\nProperties and tables that unexpectedly have no code points\n";
17755         foreach my $property (sort @unhandled_properties) {
17756             print $property, "\n";
17757         }
17758     }
17759
17760     if (%potential_files) {
17761         print "\nInput files that are not considered:\n";
17762         foreach my $file (sort keys %potential_files) {
17763             print File::Spec->abs2rel($file), "\n";
17764         }
17765     }
17766     print "\nAll done\n" if $verbosity >= $VERBOSE;
17767 }
17768 exit(0);
17769
17770 # TRAILING CODE IS USED BY make_property_test_script()
17771 __DATA__
17772
17773 use strict;
17774 use warnings;
17775
17776 # If run outside the normal test suite on an ASCII platform, you can
17777 # just create a latin1_to_native() function that just returns its
17778 # inputs, because that's the only function used from test.pl
17779 require "test.pl";
17780
17781 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
17782 # constructed by mktables from the tables it generates, so if mktables is
17783 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
17784 # feasible properties; a few aren't currently feasible; see
17785 # is_code_point_usable() in mktables for details.
17786
17787 # Standard test packages are not used because this manipulates SIG_WARN.  It
17788 # exits 0 if every non-skipped test succeeded; -1 if any failed.
17789
17790 my $Tests = 0;
17791 my $Fails = 0;
17792
17793 sub Expect($$$$) {
17794     my $expected = shift;
17795     my $ord = shift;
17796     my $regex  = shift;
17797     my $warning_type = shift;   # Type of warning message, like 'deprecated'
17798                                 # or empty if none
17799     my $line   = (caller)[2];
17800     $ord = ord(latin1_to_native(chr($ord)));
17801
17802     # Convert the code point to hex form
17803     my $string = sprintf "\"\\x{%04X}\"", $ord;
17804
17805     my @tests = "";
17806
17807     # The first time through, use all warnings.  If the input should generate
17808     # a warning, add another time through with them turned off
17809     push @tests, "no warnings '$warning_type';" if $warning_type;
17810
17811     foreach my $no_warnings (@tests) {
17812
17813         # Store any warning messages instead of outputting them
17814         local $SIG{__WARN__} = $SIG{__WARN__};
17815         my $warning_message;
17816         $SIG{__WARN__} = sub { $warning_message = $_[0] };
17817
17818         $Tests++;
17819
17820         # A string eval is needed because of the 'no warnings'.
17821         # Assumes no parens in the regular expression
17822         my $result = eval "$no_warnings
17823                             my \$RegObj = qr($regex);
17824                             $string =~ \$RegObj ? 1 : 0";
17825         if (not defined $result) {
17826             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
17827             $Fails++;
17828         }
17829         elsif ($result ^ $expected) {
17830             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
17831             $Fails++;
17832         }
17833         elsif ($warning_message) {
17834             if (! $warning_type || ($warning_type && $no_warnings)) {
17835                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
17836                 $Fails++;
17837             }
17838             else {
17839                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
17840             }
17841         }
17842         elsif ($warning_type && ! $no_warnings) {
17843             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
17844             $Fails++;
17845         }
17846         else {
17847             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
17848         }
17849     }
17850     return;
17851 }
17852
17853 sub Error($) {
17854     my $regex  = shift;
17855     $Tests++;
17856     if (eval { 'x' =~ qr/$regex/; 1 }) {
17857         $Fails++;
17858         my $line = (caller)[2];
17859         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
17860     }
17861     else {
17862         my $line = (caller)[2];
17863         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
17864     }
17865     return;
17866 }
17867
17868 # GCBTest.txt character that separates grapheme clusters
17869 my $breakable_utf8 = my $breakable = chr(0xF7);
17870 utf8::upgrade($breakable_utf8);
17871
17872 # GCBTest.txt character that indicates that the adjoining code points are part
17873 # of the same grapheme cluster
17874 my $nobreak_utf8 = my $nobreak = chr(0xD7);
17875 utf8::upgrade($nobreak_utf8);
17876
17877 sub Test_X($) {
17878     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
17879     # Each such line is a sequence of code points given by their hex numbers,
17880     # separated by the two characters defined just before this subroutine that
17881     # indicate that either there can or cannot be a break between the adjacent
17882     # code points.  If there isn't a break, that means the sequence forms an
17883     # extended grapheme cluster, which means that \X should match the whole
17884     # thing.  If there is a break, \X should stop there.  This is all
17885     # converted by this routine into a match:
17886     #   $string =~ /(\X)/,
17887     # Each \X should match the next cluster; and that is what is checked.
17888
17889     my $template = shift;
17890
17891     my $line   = (caller)[2];
17892
17893     # The line contains characters above the ASCII range, but in Latin1.  It
17894     # may or may not be in utf8, and if it is, it may or may not know it.  So,
17895     # convert these characters to 8 bits.  If knows is in utf8, simply
17896     # downgrade.
17897     if (utf8::is_utf8($template)) {
17898         utf8::downgrade($template);
17899     } else {
17900
17901         # Otherwise, if it is in utf8, but doesn't know it, the next lines
17902         # convert the two problematic characters to their 8-bit equivalents.
17903         # If it isn't in utf8, they don't harm anything.
17904         use bytes;
17905         $template =~ s/$nobreak_utf8/$nobreak/g;
17906         $template =~ s/$breakable_utf8/$breakable/g;
17907     }
17908
17909     # Get rid of the leading and trailing breakables
17910     $template =~ s/^ \s* $breakable \s* //x;
17911     $template =~ s/ \s* $breakable \s* $ //x;
17912
17913     # And no-breaks become just a space.
17914     $template =~ s/ \s* $nobreak \s* / /xg;
17915
17916     # Split the input into segments that are breakable between them.
17917     my @segments = split /\s*$breakable\s*/, $template;
17918
17919     my $string = "";
17920     my $display_string = "";
17921     my @should_match;
17922     my @should_display;
17923
17924     # Convert the code point sequence in each segment into a Perl string of
17925     # characters
17926     foreach my $segment (@segments) {
17927         my @code_points = split /\s+/, $segment;
17928         my $this_string = "";
17929         my $this_display = "";
17930         foreach my $code_point (@code_points) {
17931             $this_string .= latin1_to_native(chr(hex $code_point));
17932             $this_display .= "\\x{$code_point}";
17933         }
17934
17935         # The next cluster should match the string in this segment.
17936         push @should_match, $this_string;
17937         push @should_display, $this_display;
17938         $string .= $this_string;
17939         $display_string .= $this_display;
17940     }
17941
17942     # If a string can be represented in both non-ut8 and utf8, test both cases
17943     UPGRADE:
17944     for my $to_upgrade (0 .. 1) {
17945
17946         if ($to_upgrade) {
17947
17948             # If already in utf8, would just be a repeat
17949             next UPGRADE if utf8::is_utf8($string);
17950
17951             utf8::upgrade($string);
17952         }
17953
17954         # Finally, do the \X match.
17955         my @matches = $string =~ /(\X)/g;
17956
17957         # Look through each matched cluster to verify that it matches what we
17958         # expect.
17959         my $min = (@matches < @should_match) ? @matches : @should_match;
17960         for my $i (0 .. $min - 1) {
17961             $Tests++;
17962             if ($matches[$i] eq $should_match[$i]) {
17963                 print "ok $Tests - ";
17964                 if ($i == 0) {
17965                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
17966                 } else {
17967                     print "And \\X #", $i + 1,
17968                 }
17969                 print " correctly matched $should_display[$i]; line $line\n";
17970             } else {
17971                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
17972                                                     unpack("U*", $matches[$i]));
17973                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
17974                     $i + 1,
17975                     " should have matched $should_display[$i]",
17976                     " but instead matched $matches[$i]",
17977                     ".  Abandoning rest of line $line\n";
17978                 next UPGRADE;
17979             }
17980         }
17981
17982         # And the number of matches should equal the number of expected matches.
17983         $Tests++;
17984         if (@matches == @should_match) {
17985             print "ok $Tests - Nothing was left over; line $line\n";
17986         } else {
17987             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
17988         }
17989     }
17990
17991     return;
17992 }
17993
17994 sub Finished() {
17995     print "1..$Tests\n";
17996     exit($Fails ? -1 : 0);
17997 }
17998
17999 Error('\p{Script=InGreek}');    # Bug #69018
18000 Test_X("1100 $nobreak 1161");  # Bug #70940
18001 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18002 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18003 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726