lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34
  35 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  36 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  37
  38 ##########################################################################
  39 #
  40 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  41 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  42 # a pod file and a .t file
  43 #
  44 # The structure of this file is:
  45 #   First these introductory comments; then
  46 #   code needed for everywhere, such as debugging stuff; then
  47 #   code to handle input parameters; then
  48 #   data structures likely to be of external interest (some of which depend on
  49 #       the input parameters, so follows them; then
  50 #   more data structures and subroutine and package (class) definitions; then
  51 #   the small actual loop to process the input files and finish up; then
  52 #   a __DATA__ section, for the .t tests
  53 #
  54 # This program works on all releases of Unicode through at least 6.0.  The
  55 # outputs have been scrutinized most intently for release 5.1.  The others
  56 # have been checked for somewhat more than just sanity.  It can handle all
  57 # existing Unicode character properties in those releases.
  58 #
  59 # This program is mostly about Unicode character (or code point) properties.
  60 # A property describes some attribute or quality of a code point, like if it
  61 # is lowercase or not, its name, what version of Unicode it was first defined
  62 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  63 # possibilities by making all properties into mappings from each code point
  64 # into some corresponding value.  In the case of it being lowercase or not,
  65 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  66 # property maps each Unicode code point to a single value, called a "property
  67 # value".  (Hence each Unicode property is a true mathematical function with
  68 # exactly one value per code point.)
  69 #
  70 # When using a property in a regular expression, what is desired isn't the
  71 # mapping of the code point to its property's value, but the reverse (or the
  72 # mathematical "inverse relation"): starting with the property value, "Does a
  73 # code point map to it?"  These are written in a "compound" form:
  74 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  75 # files containing the lists of code points that map to each such regular
  76 # expression property value, one file per list
  77 #
  78 # There is also a single form shortcut that Perl adds for many of the commonly
  79 # used properties.  This happens for all binary properties, plus script,
  80 # general_category, and block properties.
  81 #
  82 # Thus the outputs of this program are files.  There are map files, mostly in
  83 # the 'To' directory; and there are list files for use in regular expression
  84 # matching, all in subdirectories of the 'lib' directory, with each
  85 # subdirectory being named for the property that the lists in it are for.
  86 # Bookkeeping, test, and documentation files are also generated.
  87
  88 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  89 my $map_directory = 'To';        # Where map files go.
  90
  91 # DATA STRUCTURES
  92 #
  93 # The major data structures of this program are Property, of course, but also
  94 # Table.  There are two kinds of tables, very similar to each other.
  95 # "Match_Table" is the data structure giving the list of code points that have
  96 # a particular property value, mentioned above.  There is also a "Map_Table"
  97 # data structure which gives the property's mapping from code point to value.
  98 # There are two structures because the match tables need to be combined in
  99 # various ways, such as constructing unions, intersections, complements, etc.,
 100 # and the map ones don't.  And there would be problems, perhaps subtle, if
 101 # a map table were inadvertently operated on in some of those ways.
 102 # The use of separate classes with operations defined on one but not the other
 103 # prevents accidentally confusing the two.
 104 #
 105 # At the heart of each table's data structure is a "Range_List", which is just
 106 # an ordered list of "Ranges", plus ancillary information, and methods to
 107 # operate on them.  A Range is a compact way to store property information.
 108 # Each range has a starting code point, an ending code point, and a value that
 109 # is meant to apply to all the code points between the two end points,
 110 # inclusive.  For a map table, this value is the property value for those
 111 # code points.  Two such ranges could be written like this:
 112 #   0x41 .. 0x5A, 'Upper',
 113 #   0x61 .. 0x7A, 'Lower'
 114 #
 115 # Each range also has a type used as a convenience to classify the values.
 116 # Most ranges in this program will be Type 0, or normal, but there are some
 117 # ranges that have a non-zero type.  These are used only in map tables, and
 118 # are for mappings that don't fit into the normal scheme of things.  Mappings
 119 # that require a hash entry to communicate with utf8.c are one example;
 120 # another example is mappings for charnames.pm to use which indicate a name
 121 # that is algorithmically determinable from its code point (and vice-versa).
 122 # These are used to significantly compact these tables, instead of listing
 123 # each one of the tens of thousands individually.
 124 #
 125 # In a match table, the value of a range is irrelevant (and hence the type as
 126 # well, which will always be 0), and arbitrarily set to the null string.
 127 # Using the example above, there would be two match tables for those two
 128 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 129 # named Lower would contain 0x61..0x7A.
 130 #
 131 # Actually, there are two types of range lists, "Range_Map" is the one
 132 # associated with map tables, and "Range_List" with match tables.
 133 # Again, this is so that methods can be defined on one and not the other so as
 134 # to prevent operating on them in incorrect ways.
 135 #
 136 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 137 # in the perl core.  All tables could in theory be written, but some are
 138 # suppressed because there is no current practical use for them.  It is easy
 139 # to change which get written by changing various lists that are near the top
 140 # of the actual code in this file.  The table data structures contain enough
 141 # ancillary information to allow them to be treated as separate entities for
 142 # writing, such as the path to each one's file.  There is a heading in each
 143 # map table that gives the format of its entries, and what the map is for all
 144 # the code points missing from it.  (This allows tables to be more compact.)
 145 #
 146 # The Property data structure contains one or more tables.  All properties
 147 # contain a map table (except the $perl property which is a
 148 # pseudo-property containing only match tables), and any properties that
 149 # are usable in regular expression matches also contain various matching
 150 # tables, one for each value the property can have.  A binary property can
 151 # have two values, True and False (or Y and N, which are preferred by Unicode
 152 # terminology).  Thus each of these properties will have a map table that
 153 # takes every code point and maps it to Y or N (but having ranges cuts the
 154 # number of entries in that table way down), and two match tables, one
 155 # which has a list of all the code points that map to Y, and one for all the
 156 # code points that map to N.  (For each of these, a third table is also
 157 # generated for the pseudo Perl property.  It contains the identical code
 158 # points as the Y table, but can be written, not in the compound form, but in
 159 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 160 # properties have several possible values, some have many, and properties like
 161 # Name have a different value for every named code point.  Those will not,
 162 # unless the controlling lists are changed, have their match tables written
 163 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 164 # constructs will.  Prior to 5.14, generally a property would have either its
 165 # map table or its match tables written but not both.  Again, what gets
 166 # written is controlled by lists which can easily be changed.  Starting in
 167 # 5.14, advantage was taken of this, and all the map tables needed to
 168 # reconstruct the Unicode db are now written out, while suppressing the
 169 # Unicode .txt files that contain the data.  Our tables are much more compact
 170 # than the .txt files, so a significant space savings was achieved.
 171
 172 # Properties have a 'Type', like binary, or string, or enum depending on how
 173 # many match tables there are and the content of the maps.  This 'Type' is
 174 # different than a range 'Type', so don't get confused by the two concepts
 175 # having the same name.
 176 #
 177 # For information about the Unicode properties, see Unicode's UAX44 document:
 178
 179 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 180
 181 # As stated earlier, this program will work on any release of Unicode so far.
 182 # Most obvious problems in earlier data have NOT been corrected except when
 183 # necessary to make Perl or this program work reasonably.  For example, no
 184 # folding information was given in early releases, so this program substitutes
 185 # lower case instead, just so that a regular expression with the /i option
 186 # will do something that actually gives the right results in many cases.
 187 # There are also a couple other corrections for version 1.1.5, commented at
 188 # the point they are made.  As an example of corrections that weren't made
 189 # (but could be) is this statement from DerivedAge.txt: "The supplementary
 190 # private use code points and the non-character code points were assigned in
 191 # version 2.0, but not specifically listed in the UCD until versions 3.0 and
 192 # 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0) More information
 193 # on Unicode version glitches is further down in these introductory comments.
 194 #
 195 # This program works on all non-provisional properties as of 6.0, though the
 196 # files for some are suppressed from apparent lack of demand for them.  You
 197 # can change which are output by changing lists in this program.
 198 #
 199 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 200 # loose matchings rules (from Unicode TR18):
 201 #
 202 #    The recommended names for UCD properties and property values are in
 203 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 204 #    [PropValue]. There are both abbreviated names and longer, more
 205 #    descriptive names. It is strongly recommended that both names be
 206 #    recognized, and that loose matching of property names be used,
 207 #    whereby the case distinctions, whitespace, hyphens, and underbar
 208 #    are ignored.
 209 # The program still allows Fuzzy to override its determination of if loose
 210 # matching should be used, but it isn't currently used, as it is no longer
 211 # needed; the calculations it makes are good enough.
 212 #
 213 # SUMMARY OF HOW IT WORKS:
 214 #
 215 #   Process arguments
 216 #
 217 #   A list is constructed containing each input file that is to be processed
 218 #
 219 #   Each file on the list is processed in a loop, using the associated handler
 220 #   code for each:
 221 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 222 #            first.  These files name the properties and property values.
 223 #            Objects are created of all the property and property value names
 224 #            that the rest of the input should expect, including all synonyms.
 225 #        The other input files give mappings from properties to property
 226 #           values.  That is, they list code points and say what the mapping
 227 #           is under the given property.  Some files give the mappings for
 228 #           just one property; and some for many.  This program goes through
 229 #           each file and populates the properties from them.  Some properties
 230 #           are listed in more than one file, and Unicode has set up a
 231 #           precedence as to which has priority if there is a conflict.  Thus
 232 #           the order of processing matters, and this program handles the
 233 #           conflict possibility by processing the overriding input files
 234 #           last, so that if necessary they replace earlier values.
 235 #        After this is all done, the program creates the property mappings not
 236 #            furnished by Unicode, but derivable from what it does give.
 237 #        The tables of code points that match each property value in each
 238 #            property that is accessible by regular expressions are created.
 239 #        The Perl-defined properties are created and populated.  Many of these
 240 #            require data determined from the earlier steps
 241 #        Any Perl-defined synonyms are created, and name clashes between Perl
 242 #            and Unicode are reconciled and warned about.
 243 #        All the properties are written to files
 244 #        Any other files are written, and final warnings issued.
 245 #
 246 # For clarity, a number of operators have been overloaded to work on tables:
 247 #   ~ means invert (take all characters not in the set).  The more
 248 #       conventional '!' is not used because of the possibility of confusing
 249 #       it with the actual boolean operation.
 250 #   + means union
 251 #   - means subtraction
 252 #   & means intersection
 253 # The precedence of these is the order listed.  Parentheses should be
 254 # copiously used.  These are not a general scheme.  The operations aren't
 255 # defined for a number of things, deliberately, to avoid getting into trouble.
 256 # Operations are done on references and affect the underlying structures, so
 257 # that the copy constructors for them have been overloaded to not return a new
 258 # clone, but the input object itself.
 259 #
 260 # The bool operator is deliberately not overloaded to avoid confusion with
 261 # "should it mean if the object merely exists, or also is non-empty?".
 262 #
 263 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 264 #
 265 # This program needs to be able to run under miniperl.  Therefore, it uses a
 266 # minimum of other modules, and hence implements some things itself that could
 267 # be gotten from CPAN
 268 #
 269 # This program uses inputs published by the Unicode Consortium.  These can
 270 # change incompatibly between releases without the Perl maintainers realizing
 271 # it.  Therefore this program is now designed to try to flag these.  It looks
 272 # at the directories where the inputs are, and flags any unrecognized files.
 273 # It keeps track of all the properties in the files it handles, and flags any
 274 # that it doesn't know how to handle.  It also flags any input lines that
 275 # don't match the expected syntax, among other checks.
 276 #
 277 # It is also designed so if a new input file matches one of the known
 278 # templates, one hopefully just needs to add it to a list to have it
 279 # processed.
 280 #
 281 # As mentioned earlier, some properties are given in more than one file.  In
 282 # particular, the files in the extracted directory are supposedly just
 283 # reformattings of the others.  But they contain information not easily
 284 # derivable from the other files, including results for Unihan, which this
 285 # program doesn't ordinarily look at, and for unassigned code points.  They
 286 # also have historically had errors or been incomplete.  In an attempt to
 287 # create the best possible data, this program thus processes them first to
 288 # glean information missing from the other files; then processes those other
 289 # files to override any errors in the extracted ones.  Much of the design was
 290 # driven by this need to store things and then possibly override them.
 291 #
 292 # It tries to keep fatal errors to a minimum, to generate something usable for
 293 # testing purposes.  It always looks for files that could be inputs, and will
 294 # warn about any that it doesn't know how to handle (the -q option suppresses
 295 # the warning).
 296 #
 297 # Why is there more than one type of range?
 298 #   This simplified things.  There are some very specialized code points that
 299 #   have to be handled specially for output, such as Hangul syllable names.
 300 #   By creating a range type (done late in the development process), it
 301 #   allowed this to be stored with the range, and overridden by other input.
 302 #   Originally these were stored in another data structure, and it became a
 303 #   mess trying to decide if a second file that was for the same property was
 304 #   overriding the earlier one or not.
 305 #
 306 # Why are there two kinds of tables, match and map?
 307 #   (And there is a base class shared by the two as well.)  As stated above,
 308 #   they actually are for different things.  Development proceeded much more
 309 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 310 #   give the property value for every code point (actually every code point
 311 #   that doesn't map to a default value).  Match tables are used for regular
 312 #   expression matches, and are essentially the inverse mapping.  Separating
 313 #   the two allows more specialized methods, and error checks so that one
 314 #   can't just take the intersection of two map tables, for example, as that
 315 #   is nonsensical.
 316 #
 317 # DEBUGGING
 318 #
 319 # This program is written so it will run under miniperl.  Occasionally changes
 320 # will cause an error where the backtrace doesn't work well under miniperl.
 321 # To diagnose the problem, you can instead run it under regular perl, if you
 322 # have one compiled.
 323 #
 324 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 325 # to return true.  Then a line like
 326 #
 327 # local $to_trace = 1 if main::DEBUG;
 328 #
 329 # can be added to enable tracing in its lexical scope or until you insert
 330 # another line:
 331 #
 332 # local $to_trace = 0 if main::DEBUG;
 333 #
 334 # then use a line like "trace $a, @b, %c, ...;
 335 #
 336 # Some of the more complex subroutines already have trace statements in them.
 337 # Permanent trace statements should be like:
 338 #
 339 # trace ... if main::DEBUG && $to_trace;
 340 #
 341 # If there is just one or a few files that you're debugging, you can easily
 342 # cause most everything else to be skipped.  Change the line
 343 #
 344 # my $debug_skip = 0;
 345 #
 346 # to 1, and every file whose object is in @input_file_objects and doesn't have
 347 # a, 'non_skip => 1,' in its constructor will be skipped.
 348 #
 349 # To compare the output tables, it may be useful to specify the -annotate
 350 # flag.  This causes the tables to expand so there is one entry for each
 351 # non-algorithmically named code point giving, currently its name, and its
 352 # graphic representation if printable (and you have a font that knows about
 353 # it).  This makes it easier to see what the particular code points are in
 354 # each output table.  The tables are usable, but because they don't have
 355 # ranges (for the most part), a Perl using them will run slower.  Non-named
 356 # code points are annotated with a description of their status, and contiguous
 357 # ones with the same description will be output as a range rather than
 358 # individually.  Algorithmically named characters are also output as ranges,
 359 # except when there are just a few contiguous ones.
 360 #
 361 # FUTURE ISSUES
 362 #
 363 # The program would break if Unicode were to change its names so that
 364 # interior white space, underscores, or dashes differences were significant
 365 # within property and property value names.
 366 #
 367 # It might be easier to use the xml versions of the UCD if this program ever
 368 # would need heavy revision, and the ability to handle old versions was not
 369 # required.
 370 #
 371 # There is the potential for name collisions, in that Perl has chosen names
 372 # that Unicode could decide it also likes.  There have been such collisions in
 373 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 374 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 375 # collisions, which were withdrawn before the final release, because of Perl's
 376 # and other's protests.  These all involved new properties which began with
 377 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 378 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 379 # Unicode document, so they are unlikely to be used by Unicode for another
 380 # purpose.  However, they might try something beginning with 'In', or use any
 381 # of the other Perl-defined properties.  This program will warn you of name
 382 # collisions, and refuse to generate tables with them, but manual intervention
 383 # will be required in this event.  One scheme that could be implemented, if
 384 # necessary, would be to have this program generate another file, or add a
 385 # field to mktables.lst that gives the date of first definition of a property.
 386 # Each new release of Unicode would use that file as a basis for the next
 387 # iteration.  And the Perl synonym addition code could sort based on the age
 388 # of the property, so older properties get priority, and newer ones that clash
 389 # would be refused; hence existing code would not be impacted, and some other
 390 # synonym would have to be used for the new property.  This is ugly, and
 391 # manual intervention would certainly be easier to do in the short run; lets
 392 # hope it never comes to this.
 393 #
 394 # A NOTE ON UNIHAN
 395 #
 396 # This program can generate tables from the Unihan database.  But it doesn't
 397 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 398 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 399 # database was split into 8 different files, all beginning with the letters
 400 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 401 # know which of the many properties in the file(s) should have tables created
 402 # for them.  It will create tables for any properties listed in
 403 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 404 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 405 # property you want is not in those files of the release you are building
 406 # against, you must add it to those two arrays.  Starting in 4.0, the
 407 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 408 # is present in the directory, a table will be generated for that property.
 409 # In 5.2, several more properties were added.  For your convenience, the two
 410 # arrays are initialized with all the 6.0 listed properties that are also in
 411 # earlier releases.  But these are commented out.  You can just uncomment the
 412 # ones you want, or use them as a template for adding entries for other
 413 # properties.
 414 #
 415 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 416 # and filter_unihan_line() are the functions where this is done.  This program
 417 # already does some adjusting to make the lines look more like the rest of the
 418 # Unicode DB;  You can see what that is in filter_unihan_line()
 419 #
 420 # There is a bug in the 3.2 data file in which some values for the
 421 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 422 # could be added for these; or for a particular installation, the Unihan.txt
 423 # file could be edited to fix them.
 424 #
 425 # HOW TO ADD A FILE TO BE PROCESSED
 426 #
 427 # A new file from Unicode needs to have an object constructed for it in
 428 # @input_file_objects, probably at the end or at the end of the extracted
 429 # ones.  The program should warn you if its name will clash with others on
 430 # restrictive file systems, like DOS.  If so, figure out a better name, and
 431 # add lines to the README.perl file giving that.  If the file is a character
 432 # property, it should be in the format that Unicode has by default
 433 # standardized for such files for the more recently introduced ones.
 434 # If so, the Input_file constructor for @input_file_objects can just be the
 435 # file name and release it first appeared in.  If not, then it should be
 436 # possible to construct an each_line_handler() to massage the line into the
 437 # standardized form.
 438 #
 439 # For non-character properties, more code will be needed.  You can look at
 440 # the existing entries for clues.
 441 #
 442 # UNICODE VERSIONS NOTES
 443 #
 444 # The Unicode UCD has had a number of errors in it over the versions.  And
 445 # these remain, by policy, in the standard for that version.  Therefore it is
 446 # risky to correct them, because code may be expecting the error.  So this
 447 # program doesn't generally make changes, unless the error breaks the Perl
 448 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 449 # for U+1105, which causes real problems for the algorithms for Jamo
 450 # calculations, so it is changed here.
 451 #
 452 # But it isn't so clear cut as to what to do about concepts that are
 453 # introduced in a later release; should they extend back to earlier releases
 454 # where the concept just didn't exist?  It was easier to do this than to not,
 455 # so that's what was done.  For example, the default value for code points not
 456 # in the files for various properties was probably undefined until changed by
 457 # some version.  No_Block for blocks is such an example.  This program will
 458 # assign No_Block even in Unicode versions that didn't have it.  This has the
 459 # benefit that code being written doesn't have to special case earlier
 460 # versions; and the detriment that it doesn't match the Standard precisely for
 461 # the affected versions.
 462 #
 463 # Here are some observations about some of the issues in early versions:
 464 #
 465 # The number of code points in \p{alpha} halved in 2.1.9.  It turns out that
 466 # the reason is that the CJK block starting at 4E00 was removed from PropList,
 467 # and was not put back in until 3.1.0
 468 #
 469 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 470 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 471 # reason is that 3.2 introduced U+205F=medium math space, which was not
 472 # classed as white space, but Perl figured out that it should have been. 4.0
 473 # reclassified it correctly.
 474 #
 475 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 476 # this was erroneously a synonym for 202.  In 4.0, ATB became 202, and ATBL
 477 # was left with no code points, as all the ones that mapped to 202 stayed
 478 # mapped to 202.  Thus if your program used the numeric name for the class,
 479 # it would not have been affected, but if it used the mnemonic, it would have
 480 # been.
 481 #
 482 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 483 # points which eventually came to have this script property value, instead
 484 # mapped to "Unknown".  But in the next release all these code points were
 485 # moved to \p{sc=common} instead.
 486 #
 487 # The default for missing code points for BidiClass is complicated.  Starting
 488 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 489 # tries to do the best it can for earlier releases.  It is done in
 490 # process_PropertyAliases()
 491 #
 492 ##############################################################################
 493
 494 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 495                         # and errors
 496 my $MAX_LINE_WIDTH = 78;
 497
 498 # Debugging aid to skip most files so as to not be distracted by them when
 499 # concentrating on the ones being debugged.  Add
 500 # non_skip => 1,
 501 # to the constructor for those files you want processed when you set this.
 502 # Files with a first version number of 0 are special: they are always
 503 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 504 # UnicodeData.txt must not be skipped if you want this program to not die
 505 # before normal completion.
 506 my $debug_skip = 0;
 507
 508 # Set to 1 to enable tracing.
 509 our $to_trace = 0;
 510
 511 { # Closure for trace: debugging aid
 512     my $print_caller = 1;        # ? Include calling subroutine name
 513     my $main_with_colon = 'main::';
 514     my $main_colon_length = length($main_with_colon);
 515
 516     sub trace {
 517         return unless $to_trace;        # Do nothing if global flag not set
 518
 519         my @input = @_;
 520
 521         local $DB::trace = 0;
 522         $DB::trace = 0;          # Quiet 'used only once' message
 523
 524         my $line_number;
 525
 526         # Loop looking up the stack to get the first non-trace caller
 527         my $caller_line;
 528         my $caller_name;
 529         my $i = 0;
 530         do {
 531             $line_number = $caller_line;
 532             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 533             $caller = $main_with_colon unless defined $caller;
 534
 535             $caller_name = $caller;
 536
 537             # get rid of pkg
 538             $caller_name =~ s/.*:://;
 539             if (substr($caller_name, 0, $main_colon_length)
 540                 eq $main_with_colon)
 541             {
 542                 $caller_name = substr($caller_name, $main_colon_length);
 543             }
 544
 545         } until ($caller_name ne 'trace');
 546
 547         # If the stack was empty, we were called from the top level
 548         $caller_name = 'main' if ($caller_name eq ""
 549                                     || $caller_name eq 'trace');
 550
 551         my $output = "";
 552         foreach my $string (@input) {
 553             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 554             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 555                 $output .= simple_dumper($string);
 556             }
 557             else {
 558                 $string = "$string" if ref $string;
 559                 $string = $UNDEF unless defined $string;
 560                 chomp $string;
 561                 $string = '""' if $string eq "";
 562                 $output .= " " if $output ne ""
 563                                 && $string ne ""
 564                                 && substr($output, -1, 1) ne " "
 565                                 && substr($string, 0, 1) ne " ";
 566                 $output .= $string;
 567             }
 568         }
 569
 570         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 571         print STDERR "$caller_name: " if $print_caller;
 572         print STDERR $output, "\n";
 573         return;
 574     }
 575 }
 576
 577 # This is for a rarely used development feature that allows you to compare two
 578 # versions of the Unicode standard without having to deal with changes caused
 579 # by the code points introduced in the later version.  Change the 0 to a
 580 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 581 # code points introduced in that release and earlier will be used; later ones
 582 # are thrown away.  You use the version number of the earliest one you want to
 583 # compare; then run this program on directory structures containing each
 584 # release, and compare the outputs.  These outputs will therefore include only
 585 # the code points common to both releases, and you can see the changes caused
 586 # just by the underlying release semantic changes.  For versions earlier than
 587 # 3.2, you must copy a version of DAge.txt into the directory.
 588 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 589 my $compare_versions = DEBUG
 590                        && $string_compare_versions
 591                        && pack "C*", split /\./, $string_compare_versions;
 592
 593 sub uniques {
 594     # Returns non-duplicated input values.  From "Perl Best Practices:
 595     # Encapsulated Cleverness".  p. 455 in first edition.
 596
 597     my %seen;
 598     # Arguably this breaks encapsulation, if the goal is to permit multiple
 599     # distinct objects to stringify to the same value, and be interchangeable.
 600     # However, for this program, no two objects stringify identically, and all
 601     # lists passed to this function are either objects or strings. So this
 602     # doesn't affect correctness, but it does give a couple of percent speedup.
 603     no overloading;
 604     return grep { ! $seen{$_}++ } @_;
 605 }
 606
 607 $0 = File::Spec->canonpath($0);
 608
 609 my $make_test_script = 0;      # ? Should we output a test script
 610 my $write_unchanged_files = 0; # ? Should we update the output files even if
 611                                #    we don't think they have changed
 612 my $use_directory = "";        # ? Should we chdir somewhere.
 613 my $pod_directory;             # input directory to store the pod file.
 614 my $pod_file = 'perluniprops';
 615 my $t_path;                     # Path to the .t test file
 616 my $file_list = 'mktables.lst'; # File to store input and output file names.
 617                                # This is used to speed up the build, by not
 618                                # executing the main body of the program if
 619                                # nothing on the list has changed since the
 620                                # previous build
 621 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 622                                # make a list so that when the pumpking is
 623                                # preparing a release, s/he won't have to do
 624                                # special things
 625 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 626                                # in the input.
 627 my $output_range_counts = $debugging_build;   # ? Should we include the number
 628                                               # of code points in ranges in
 629                                               # the output
 630 my $annotate = 0;              # ? Should character names be in the output
 631
 632 # Verbosity levels; 0 is quiet
 633 my $NORMAL_VERBOSITY = 1;
 634 my $PROGRESS = 2;
 635 my $VERBOSE = 3;
 636
 637 my $verbosity = $NORMAL_VERBOSITY;
 638
 639 # Process arguments
 640 while (@ARGV) {
 641     my $arg = shift @ARGV;
 642     if ($arg eq '-v') {
 643         $verbosity = $VERBOSE;
 644     }
 645     elsif ($arg eq '-p') {
 646         $verbosity = $PROGRESS;
 647         $| = 1;     # Flush buffers as we go.
 648     }
 649     elsif ($arg eq '-q') {
 650         $verbosity = 0;
 651     }
 652     elsif ($arg eq '-w') {
 653         $write_unchanged_files = 1; # update the files even if havent changed
 654     }
 655     elsif ($arg eq '-check') {
 656         my $this = shift @ARGV;
 657         my $ok = shift @ARGV;
 658         if ($this ne $ok) {
 659             print "Skipping as check params are not the same.\n";
 660             exit(0);
 661         }
 662     }
 663     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 664         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 665     }
 666     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 667     {
 668         $make_test_script = 1;
 669     }
 670     elsif ($arg eq '-makelist') {
 671         $make_list = 1;
 672     }
 673     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 674         -d $use_directory or croak "Unknown directory '$use_directory'";
 675     }
 676     elsif ($arg eq '-L') {
 677
 678         # Existence not tested until have chdir'd
 679         $file_list = shift;
 680     }
 681     elsif ($arg eq '-globlist') {
 682         $glob_list = 1;
 683     }
 684     elsif ($arg eq '-c') {
 685         $output_range_counts = ! $output_range_counts
 686     }
 687     elsif ($arg eq '-annotate') {
 688         $annotate = 1;
 689         $debugging_build = 1;
 690         $output_range_counts = 1;
 691     }
 692     else {
 693         my $with_c = 'with';
 694         $with_c .= 'out' if $output_range_counts;   # Complements the state
 695         croak <<END;
 696 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 697           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 698           [-check A B ]
 699   -c          : Output comments $with_c number of code points in ranges
 700   -q          : Quiet Mode: Only output serious warnings.
 701   -p          : Set verbosity level to normal plus show progress.
 702   -v          : Set Verbosity level high:  Show progress and non-serious
 703                 warnings
 704   -w          : Write files regardless
 705   -C dir      : Change to this directory before proceeding. All relative paths
 706                 except those specified by the -P and -T options will be done
 707                 with respect to this directory.
 708   -P dir      : Output $pod_file file to directory 'dir'.
 709   -T path     : Create a test script as 'path'; overrides -maketest
 710   -L filelist : Use alternate 'filelist' instead of standard one
 711   -globlist   : Take as input all non-Test *.txt files in current and sub
 712                 directories
 713   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 714                 overrides -T
 715   -makelist   : Rewrite the file list $file_list based on current setup
 716   -annotate   : Output an annotation for each character in the table files;
 717                 useful for debugging mktables, looking at diffs; but is slow,
 718                 memory intensive; resulting tables are usable but slow and
 719                 very large.
 720   -check A B  : Executes $0 only if A and B are the same
 721 END
 722     }
 723 }
 724
 725 # Stores the most-recently changed file.  If none have changed, can skip the
 726 # build
 727 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 728
 729 # Change directories now, because need to read 'version' early.
 730 if ($use_directory) {
 731     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 732         $pod_directory = File::Spec->rel2abs($pod_directory);
 733     }
 734     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 735         $t_path = File::Spec->rel2abs($t_path);
 736     }
 737     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 738     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 739         $pod_directory = File::Spec->abs2rel($pod_directory);
 740     }
 741     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 742         $t_path = File::Spec->abs2rel($t_path);
 743     }
 744 }
 745
 746 # Get Unicode version into regular and v-string.  This is done now because
 747 # various tables below get populated based on it.  These tables are populated
 748 # here to be near the top of the file, and so easily seeable by those needing
 749 # to modify things.
 750 open my $VERSION, "<", "version"
 751                     or croak "$0: can't open required file 'version': $!\n";
 752 my $string_version = <$VERSION>;
 753 close $VERSION;
 754 chomp $string_version;
 755 my $v_version = pack "C*", split /\./, $string_version;        # v string
 756
 757 # The following are the complete names of properties with property values that
 758 # are known to not match any code points in some versions of Unicode, but that
 759 # may change in the future so they should be matchable, hence an empty file is
 760 # generated for them.
 761 my @tables_that_may_be_empty = (
 762                                 'Joining_Type=Left_Joining',
 763                                 );
 764 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 765 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 766 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 767                                                     if $v_version ge v4.1.0;
 768 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 769                                                     if $v_version ge v6.0.0;
 770
 771 # The lists below are hashes, so the key is the item in the list, and the
 772 # value is the reason why it is in the list.  This makes generation of
 773 # documentation easier.
 774
 775 my %why_suppressed;  # No file generated for these.
 776
 777 # Files aren't generated for empty extraneous properties.  This is arguable.
 778 # Extraneous properties generally come about because a property is no longer
 779 # used in a newer version of Unicode.  If we generated a file without code
 780 # points, programs that used to work on that property will still execute
 781 # without errors.  It just won't ever match (or will always match, with \P{}).
 782 # This means that the logic is now likely wrong.  I (khw) think its better to
 783 # find this out by getting an error message.  Just move them to the table
 784 # above to change this behavior
 785 my %why_suppress_if_empty_warn_if_not = (
 786
 787    # It is the only property that has ever officially been removed from the
 788    # Standard.  The database never contained any code points for it.
 789    'Special_Case_Condition' => 'Obsolete',
 790
 791    # Apparently never official, but there were code points in some versions of
 792    # old-style PropList.txt
 793    'Non_Break' => 'Obsolete',
 794 );
 795
 796 # These would normally go in the warn table just above, but they were changed
 797 # a long time before this program was written, so warnings about them are
 798 # moot.
 799 if ($v_version gt v3.2.0) {
 800     push @tables_that_may_be_empty,
 801                                 'Canonical_Combining_Class=Attached_Below_Left'
 802 }
 803
 804 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 805 # unless explicitly added.
 806 if ($v_version ge v5.2.0) {
 807     my $unihan = 'Unihan; remove from list if using Unihan';
 808     foreach my $table (qw (
 809                            kAccountingNumeric
 810                            kOtherNumeric
 811                            kPrimaryNumeric
 812                            kCompatibilityVariant
 813                            kIICore
 814                            kIRG_GSource
 815                            kIRG_HSource
 816                            kIRG_JSource
 817                            kIRG_KPSource
 818                            kIRG_MSource
 819                            kIRG_KSource
 820                            kIRG_TSource
 821                            kIRG_USource
 822                            kIRG_VSource
 823                            kRSUnicode
 824                         ))
 825     {
 826         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 827     }
 828 }
 829
 830 # Enum values for to_output_map() method in the Map_Table package.
 831 my $EXTERNAL_MAP = 1;
 832 my $INTERNAL_MAP = 2;
 833
 834 # To override computed values for writing the map tables for these properties.
 835 # The default for enum map tables is to write them out, so that the Unicode
 836 # .txt files can be removed, but all the data to compute any property value
 837 # for any code point is available in a more compact form.
 838 my %global_to_output_map = (
 839     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 840     # get stuck supporting it if things change.  Since it is a STRING
 841     # property, it normally would be listed in the pod, but INTERNAL_MAP
 842     # suppresses that.
 843     Unicode_1_Name => $INTERNAL_MAP,
 844
 845     Present_In => 0,                # Suppress, as easily computed from Age
 846     Block => 0,                     # Suppress, as Blocks.txt is retained.
 847 );
 848
 849 # Properties that this program ignores.
 850 my @unimplemented_properties = (
 851 'Unicode_Radical_Stroke'    # Remove if changing to handle this one.
 852 );
 853
 854 # There are several types of obsolete properties defined by Unicode.  These
 855 # must be hand-edited for every new Unicode release.
 856 my %why_deprecated;  # Generates a deprecated warning message if used.
 857 my %why_stabilized;  # Documentation only
 858 my %why_obsolete;    # Documentation only
 859
 860 {   # Closure
 861     my $simple = 'Perl uses the more complete version of this property';
 862     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 863
 864     my $other_properties = 'other properties';
 865     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 866     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 867
 868     %why_deprecated = (
 869         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 870         'Jamo_Short_Name' => $contributory,
 871         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 872         'Other_Alphabetic' => $contributory,
 873         'Other_Default_Ignorable_Code_Point' => $contributory,
 874         'Other_Grapheme_Extend' => $contributory,
 875         'Other_ID_Continue' => $contributory,
 876         'Other_ID_Start' => $contributory,
 877         'Other_Lowercase' => $contributory,
 878         'Other_Math' => $contributory,
 879         'Other_Uppercase' => $contributory,
 880     );
 881
 882     %why_suppressed = (
 883         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 884         # contains the same information, but without the algorithmically
 885         # determinable Hangul syllables'.  This file is not published, so it's
 886         # existence is not noted in the comment.
 887         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
 888
 889         'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo.  Obsoleted, and code points for it removed in Unicode 5.2',
 890
 891         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold",
 892         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 893         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 894         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo",
 895
 896         'Name' => "Accessible via 'use charnames;'",
 897         'Name_Alias' => "Accessible via 'use charnames;'",
 898
 899         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 900         Expands_On_NFC => $why_no_expand,
 901         Expands_On_NFD => $why_no_expand,
 902         Expands_On_NFKC => $why_no_expand,
 903         Expands_On_NFKD => $why_no_expand,
 904     );
 905
 906     # The following are suppressed because they were made contributory or
 907     # deprecated by Unicode before Perl ever thought about supporting them.
 908     foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
 909         $why_suppressed{$property} = $why_deprecated{$property};
 910     }
 911
 912     # Customize the message for all the 'Other_' properties
 913     foreach my $property (keys %why_deprecated) {
 914         next if (my $main_property = $property) !~ s/^Other_//;
 915         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 916     }
 917 }
 918
 919 if ($v_version ge 4.0.0) {
 920     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 921     if ($v_version ge 6.0.0) {
 922         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
 923     }
 924 }
 925 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
 926     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 927     if ($v_version ge 6.0.0) {
 928         $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
 929     }
 930 }
 931
 932 # Probably obsolete forever
 933 if ($v_version ge v4.1.0) {
 934     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
 935 }
 936 if ($v_version ge v6.0.0) {
 937     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
 938     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
 939 }
 940
 941 # This program can create files for enumerated-like properties, such as
 942 # 'Numeric_Type'.  This file would be the same format as for a string
 943 # property, with a mapping from code point to its value, so you could look up,
 944 # for example, the script a code point is in.  But no one so far wants this
 945 # mapping, or they have found another way to get it since this is a new
 946 # feature.  So no file is generated except if it is in this list.
 947 my @output_mapped_properties = split "\n", <<END;
 948 END
 949
 950 # If you are using the Unihan database in a Unicode version before 5.2, you
 951 # need to add the properties that you want to extract from it to this table.
 952 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
 953 # listed, commented out
 954 my @cjk_properties = split "\n", <<'END';
 955 #cjkAccountingNumeric; kAccountingNumeric
 956 #cjkOtherNumeric; kOtherNumeric
 957 #cjkPrimaryNumeric; kPrimaryNumeric
 958 #cjkCompatibilityVariant; kCompatibilityVariant
 959 #cjkIICore ; kIICore
 960 #cjkIRG_GSource; kIRG_GSource
 961 #cjkIRG_HSource; kIRG_HSource
 962 #cjkIRG_JSource; kIRG_JSource
 963 #cjkIRG_KPSource; kIRG_KPSource
 964 #cjkIRG_KSource; kIRG_KSource
 965 #cjkIRG_TSource; kIRG_TSource
 966 #cjkIRG_USource; kIRG_USource
 967 #cjkIRG_VSource; kIRG_VSource
 968 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
 969 END
 970
 971 # Similarly for the property values.  For your convenience, the lines in the
 972 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
 973 # '#' marks (for Unicode versions before 5.2)
 974 my @cjk_property_values = split "\n", <<'END';
 975 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
 976 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
 977 ## @missing: 0000..10FFFF; cjkIICore; <none>
 978 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
 979 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
 980 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
 981 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
 982 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
 983 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
 984 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
 985 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
 986 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
 987 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
 988 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
 989 END
 990
 991 # The input files don't list every code point.  Those not listed are to be
 992 # defaulted to some value.  Below are hard-coded what those values are for
 993 # non-binary properties as of 5.1.  Starting in 5.0, there are
 994 # machine-parsable comment lines in the files the give the defaults; so this
 995 # list shouldn't have to be extended.  The claim is that all missing entries
 996 # for binary properties will default to 'N'.  Unicode tried to change that in
 997 # 5.2, but the beta period produced enough protest that they backed off.
 998 #
 999 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1000 # be in the form that it expects.  The others may be synonyms.
1001 my $CODE_POINT = '<code point>';
1002 my %default_mapping = (
1003     Age => "Unassigned",
1004     # Bidi_Class => Complicated; set in code
1005     Bidi_Mirroring_Glyph => "",
1006     Block => 'No_Block',
1007     Canonical_Combining_Class => 0,
1008     Case_Folding => $CODE_POINT,
1009     Decomposition_Mapping => $CODE_POINT,
1010     Decomposition_Type => 'None',
1011     East_Asian_Width => "Neutral",
1012     FC_NFKC_Closure => $CODE_POINT,
1013     General_Category => 'Cn',
1014     Grapheme_Cluster_Break => 'Other',
1015     Hangul_Syllable_Type => 'NA',
1016     ISO_Comment => "",
1017     Jamo_Short_Name => "",
1018     Joining_Group => "No_Joining_Group",
1019     # Joining_Type => Complicated; set in code
1020     kIICore => 'N',   #                       Is converted to binary
1021     #Line_Break => Complicated; set in code
1022     Lowercase_Mapping => $CODE_POINT,
1023     Name => "",
1024     Name_Alias => "",
1025     NFC_QC => 'Yes',
1026     NFD_QC => 'Yes',
1027     NFKC_QC => 'Yes',
1028     NFKD_QC => 'Yes',
1029     Numeric_Type => 'None',
1030     Numeric_Value => 'NaN',
1031     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1032     Sentence_Break => 'Other',
1033     Simple_Case_Folding => $CODE_POINT,
1034     Simple_Lowercase_Mapping => $CODE_POINT,
1035     Simple_Titlecase_Mapping => $CODE_POINT,
1036     Simple_Uppercase_Mapping => $CODE_POINT,
1037     Titlecase_Mapping => $CODE_POINT,
1038     Unicode_1_Name => "",
1039     Unicode_Radical_Stroke => "",
1040     Uppercase_Mapping => $CODE_POINT,
1041     Word_Break => 'Other',
1042 );
1043
1044 # Below are files that Unicode furnishes, but this program ignores, and why
1045 my %ignored_files = (
1046     'CJKRadicals.txt' => 'Unihan data',
1047     'Index.txt' => 'An index, not actual data',
1048     'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1049     'NamesList.txt' => 'Just adds commentary',
1050     'NormalizationCorrections.txt' => 'Data is already in other files.',
1051     'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1052     'ReadMe.txt' => 'Just comments',
1053     'README.TXT' => 'Just comments',
1054     'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property.  Does not fit into current scheme where one code point is mapped',
1055     'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
1056     'IndicMatraCategory.txt' => 'Provisional',
1057     'IndicSyllabicCategory.txt' => 'Provisional',
1058 );
1059
1060 ### End of externally interesting definitions, except for @input_file_objects
1061
1062 my $HEADER=<<"EOF";
1063 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1064 # This file is machine-generated by $0 from the Unicode
1065 # database, Version $string_version.  Any changes made here will be lost!
1066 EOF
1067
1068 my $INTERNAL_ONLY=<<"EOF";
1069
1070 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1071 # This file is for internal use by core Perl only.  The format and even the
1072 # name or existence of this file are subject to change without notice.  Don't
1073 # use it directly.
1074 EOF
1075
1076 my $DEVELOPMENT_ONLY=<<"EOF";
1077 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1078 # This file contains information artificially constrained to code points
1079 # present in Unicode release $string_compare_versions.
1080 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1081 # not be used for production.
1082
1083 EOF
1084
1085 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1086 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1087 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1088
1089 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1090 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1091 # to decrease backtracking.  The first one allows the code point to be at the
1092 # end of a word, but to work properly, the word shouldn't end with a valid hex
1093 # character.  The second one won't match a code point at the end of a word,
1094 # and doesn't have the run-on issue
1095 my $run_on_code_point_re =
1096             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1097 my $code_point_re = qr/\b$run_on_code_point_re/;
1098
1099 # This matches the beginning of the line in the Unicode db files that give the
1100 # defaults for code points not listed (i.e., missing) in the file.  The code
1101 # depends on this ending with a semi-colon, so it can assume it is a valid
1102 # field when the line is split() by semi-colons
1103 my $missing_defaults_prefix =
1104             qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1105
1106 # Property types.  Unicode has more types, but these are sufficient for our
1107 # purposes.
1108 my $UNKNOWN = -1;   # initialized to illegal value
1109 my $NON_STRING = 1; # Either binary or enum
1110 my $BINARY = 2;
1111 my $ENUM = 3;       # Include catalog
1112 my $STRING = 4;     # Anything else: string or misc
1113
1114 # Some input files have lines that give default values for code points not
1115 # contained in the file.  Sometimes these should be ignored.
1116 my $NO_DEFAULTS = 0;        # Must evaluate to false
1117 my $NOT_IGNORED = 1;
1118 my $IGNORED = 2;
1119
1120 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1121 # and will appear in the main body of the tables in the output files, but
1122 # there are other types of ranges as well, listed below, that are specially
1123 # handled.   There are pseudo-types as well that will never be stored as a
1124 # type, but will affect the calculation of the type.
1125
1126 # 0 is for normal, non-specials
1127 my $MULTI_CP = 1;           # Sequence of more than code point
1128 my $HANGUL_SYLLABLE = 2;
1129 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1130 my $NULL = 4;               # The map is to the null string; utf8.c can't
1131                             # handle these, nor is there an accepted syntax
1132                             # for them in \p{} constructs
1133 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1134                              # otherwise be $MULTI_CP type are instead type 0
1135
1136 # process_generic_property_file() can accept certain overrides in its input.
1137 # Each of these must begin AND end with $CMD_DELIM.
1138 my $CMD_DELIM = "\a";
1139 my $REPLACE_CMD = 'replace';    # Override the Replace
1140 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1141
1142 my $NO = 0;
1143 my $YES = 1;
1144
1145 # Values for the Replace argument to add_range.
1146 # $NO                      # Don't replace; add only the code points not
1147                            # already present.
1148 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1149                            # the comments at the subroutine definition.
1150 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1151 my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
1152                            # already there
1153 my $CROAK = 5;             # Die with an error if is already there
1154
1155 # Flags to give property statuses.  The phrases are to remind maintainers that
1156 # if the flag is changed, the indefinite article referring to it in the
1157 # documentation may need to be as well.
1158 my $NORMAL = "";
1159 my $SUPPRESSED = 'z';   # The character should never actually be seen, since
1160                         # it is suppressed
1161 my $PLACEHOLDER = 'P';  # A property that is defined as a placeholder in a
1162                         # Unicode version that doesn't have it, but we need it
1163                         # to be defined, if empty, to have things work.
1164                         # Implies no pod entry generated
1165 my $DEPRECATED = 'D';
1166 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1167 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1168 my $DISCOURAGED = 'X';
1169 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1170 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1171 my $STRICTER = 'T';
1172 my $a_bold_stricter = "a 'B<$STRICTER>'";
1173 my $A_bold_stricter = "A 'B<$STRICTER>'";
1174 my $STABILIZED = 'S';
1175 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1176 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1177 my $OBSOLETE = 'O';
1178 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1179 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1180
1181 my %status_past_participles = (
1182     $DISCOURAGED => 'discouraged',
1183     $SUPPRESSED => 'should never be generated',
1184     $STABILIZED => 'stabilized',
1185     $OBSOLETE => 'obsolete',
1186     $DEPRECATED => 'deprecated',
1187 );
1188
1189 # The format of the values of the tables:
1190 my $EMPTY_FORMAT = "";
1191 my $BINARY_FORMAT = 'b';
1192 my $DECIMAL_FORMAT = 'd';
1193 my $FLOAT_FORMAT = 'f';
1194 my $INTEGER_FORMAT = 'i';
1195 my $HEX_FORMAT = 'x';
1196 my $RATIONAL_FORMAT = 'r';
1197 my $STRING_FORMAT = 's';
1198 my $DECOMP_STRING_FORMAT = 'c';
1199
1200 my %map_table_formats = (
1201     $BINARY_FORMAT => 'binary',
1202     $DECIMAL_FORMAT => 'single decimal digit',
1203     $FLOAT_FORMAT => 'floating point number',
1204     $INTEGER_FORMAT => 'integer',
1205     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1206     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1207     $STRING_FORMAT => 'string',
1208     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1209 );
1210
1211 # Unicode didn't put such derived files in a separate directory at first.
1212 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1213 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1214 my $AUXILIARY = 'auxiliary';
1215
1216 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1217 my %loose_to_file_of;       # loosely maps table names to their respective
1218                             # files
1219 my %stricter_to_file_of;    # same; but for stricter mapping.
1220 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1221                              # their rational equivalent
1222 my %loose_property_name_of; # Loosely maps (non_string) property names to
1223                             # standard form
1224
1225 # Most properties are immune to caseless matching, otherwise you would get
1226 # nonsensical results, as properties are a function of a code point, not
1227 # everything that is caselessly equivalent to that code point.  For example,
1228 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1229 # be true because 's' and 'S' are equivalent caselessly.  However,
1230 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1231 # extend that concept to those very few properties that are like this.  Each
1232 # such property will match the full range caselessly.  They are hard-coded in
1233 # the program; it's not worth trying to make it general as it's extremely
1234 # unlikely that they will ever change.
1235 my %caseless_equivalent_to;
1236
1237 # These constants names and values were taken from the Unicode standard,
1238 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1239 # syllables.  The '_string' versions are so generated tables can retain the
1240 # hex format, which is the more familiar value
1241 my $SBase_string = "0xAC00";
1242 my $SBase = CORE::hex $SBase_string;
1243 my $LBase_string = "0x1100";
1244 my $LBase = CORE::hex $LBase_string;
1245 my $VBase_string = "0x1161";
1246 my $VBase = CORE::hex $VBase_string;
1247 my $TBase_string = "0x11A7";
1248 my $TBase = CORE::hex $TBase_string;
1249 my $SCount = 11172;
1250 my $LCount = 19;
1251 my $VCount = 21;
1252 my $TCount = 28;
1253 my $NCount = $VCount * $TCount;
1254
1255 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1256 # with the above published constants.
1257 my %Jamo;
1258 my %Jamo_L;     # Leading consonants
1259 my %Jamo_V;     # Vowels
1260 my %Jamo_T;     # Trailing consonants
1261
1262 my @backslash_X_tests;     # List of tests read in for testing \X
1263 my @unhandled_properties;  # Will contain a list of properties found in
1264                            # the input that we didn't process.
1265 my @match_properties;      # Properties that have match tables, to be
1266                            # listed in the pod
1267 my @map_properties;        # Properties that get map files written
1268 my @named_sequences;       # NamedSequences.txt contents.
1269 my %potential_files;       # Generated list of all .txt files in the directory
1270                            # structure so we can warn if something is being
1271                            # ignored.
1272 my @files_actually_output; # List of files we generated.
1273 my @more_Names;            # Some code point names are compound; this is used
1274                            # to store the extra components of them.
1275 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1276                            # the minimum before we consider it equivalent to a
1277                            # candidate rational
1278 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1279
1280 # These store references to certain commonly used property objects
1281 my $gc;
1282 my $perl;
1283 my $block;
1284 my $perl_charname;
1285 my $print;
1286 my $Any;
1287
1288 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1289 my $has_In_conflicts = 0;
1290 my $has_Is_conflicts = 0;
1291
1292 sub internal_file_to_platform ($) {
1293     # Convert our file paths which have '/' separators to those of the
1294     # platform.
1295
1296     my $file = shift;
1297     return undef unless defined $file;
1298
1299     return File::Spec->join(split '/', $file);
1300 }
1301
1302 sub file_exists ($) {   # platform independent '-e'.  This program internally
1303                         # uses slash as a path separator.
1304     my $file = shift;
1305     return 0 if ! defined $file;
1306     return -e internal_file_to_platform($file);
1307 }
1308
1309 sub objaddr($) {
1310     # Returns the address of the blessed input object.
1311     # It doesn't check for blessedness because that would do a string eval
1312     # every call, and the program is structured so that this is never called
1313     # for a non-blessed object.
1314
1315     no overloading; # If overloaded, numifying below won't work.
1316
1317     # Numifying a ref gives its address.
1318     return pack 'J', $_[0];
1319 }
1320
1321 # These are used only if $annotate is true.
1322 # The entire range of Unicode characters is examined to populate these
1323 # after all the input has been processed.  But most can be skipped, as they
1324 # have the same descriptive phrases, such as being unassigned
1325 my @viacode;            # Contains the 1 million character names
1326 my @printable;          # boolean: And are those characters printable?
1327 my @annotate_char_type; # Contains a type of those characters, specifically
1328                         # for the purposes of annotation.
1329 my $annotate_ranges;    # A map of ranges of code points that have the same
1330                         # name for the purposes of annotation.  They map to the
1331                         # upper edge of the range, so that the end point can
1332                         # be immediately found.  This is used to skip ahead to
1333                         # the end of a range, and avoid processing each
1334                         # individual code point in it.
1335 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1336                                    # characters, but excluding those which are
1337                                    # also noncharacter code points
1338
1339 # The annotation types are an extension of the regular range types, though
1340 # some of the latter are folded into one.  Make the new types negative to
1341 # avoid conflicting with the regular types
1342 my $SURROGATE_TYPE = -1;
1343 my $UNASSIGNED_TYPE = -2;
1344 my $PRIVATE_USE_TYPE = -3;
1345 my $NONCHARACTER_TYPE = -4;
1346 my $CONTROL_TYPE = -5;
1347 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1348
1349 sub populate_char_info ($) {
1350     # Used only with the $annotate option.  Populates the arrays with the
1351     # input code point's info that are needed for outputting more detailed
1352     # comments.  If calling context wants a return, it is the end point of
1353     # any contiguous range of characters that share essentially the same info
1354
1355     my $i = shift;
1356     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1357
1358     $viacode[$i] = $perl_charname->value_of($i) || "";
1359
1360     # A character is generally printable if Unicode says it is,
1361     # but below we make sure that most Unicode general category 'C' types
1362     # aren't.
1363     $printable[$i] = $print->contains($i);
1364
1365     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1366
1367     # Only these two regular types are treated specially for annotations
1368     # purposes
1369     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1370                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1371
1372     # Give a generic name to all code points that don't have a real name.
1373     # We output ranges, if applicable, for these.  Also calculate the end
1374     # point of the range.
1375     my $end;
1376     if (! $viacode[$i]) {
1377         if ($gc-> table('Surrogate')->contains($i)) {
1378             $viacode[$i] = 'Surrogate';
1379             $annotate_char_type[$i] = $SURROGATE_TYPE;
1380             $printable[$i] = 0;
1381             $end = $gc->table('Surrogate')->containing_range($i)->end;
1382         }
1383         elsif ($gc-> table('Private_use')->contains($i)) {
1384             $viacode[$i] = 'Private Use';
1385             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1386             $printable[$i] = 0;
1387             $end = $gc->table('Private_Use')->containing_range($i)->end;
1388         }
1389         elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1390                                                                 contains($i))
1391         {
1392             $viacode[$i] = 'Noncharacter';
1393             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1394             $printable[$i] = 0;
1395             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1396                                                     containing_range($i)->end;
1397         }
1398         elsif ($gc-> table('Control')->contains($i)) {
1399             $viacode[$i] = 'Control';
1400             $annotate_char_type[$i] = $CONTROL_TYPE;
1401             $printable[$i] = 0;
1402             $end = 0x81 if $i == 0x80;  # Hard-code this one known case
1403         }
1404         elsif ($gc-> table('Unassigned')->contains($i)) {
1405             $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1406             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1407             $printable[$i] = 0;
1408
1409             # Because we name the unassigned by the blocks they are in, it
1410             # can't go past the end of that block, and it also can't go past
1411             # the unassigned range it is in.  The special table makes sure
1412             # that the non-characters, which are unassigned, are separated
1413             # out.
1414             $end = min($block->containing_range($i)->end,
1415                        $unassigned_sans_noncharacters-> containing_range($i)->
1416                                                                          end);
1417         }
1418         else {
1419             Carp::my_carp_bug("Can't figure out how to annotate "
1420                               . sprintf("U+%04X", $i)
1421                               . ".  Proceeding anyway.");
1422             $viacode[$i] = 'UNKNOWN';
1423             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1424             $printable[$i] = 0;
1425         }
1426     }
1427
1428     # Here, has a name, but if it's one in which the code point number is
1429     # appended to the name, do that.
1430     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1431         $viacode[$i] .= sprintf("-%04X", $i);
1432         $end = $perl_charname->containing_range($i)->end;
1433     }
1434
1435     # And here, has a name, but if it's a hangul syllable one, replace it with
1436     # the correct name from the Unicode algorithm
1437     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1438         use integer;
1439         my $SIndex = $i - $SBase;
1440         my $L = $LBase + $SIndex / $NCount;
1441         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1442         my $T = $TBase + $SIndex % $TCount;
1443         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1444         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1445         $end = $perl_charname->containing_range($i)->end;
1446     }
1447
1448     return if ! defined wantarray;
1449     return $i if ! defined $end;    # If not a range, return the input
1450
1451     # Save this whole range so can find the end point quickly
1452     $annotate_ranges->add_map($i, $end, $end);
1453
1454     return $end;
1455 }
1456
1457 # Commented code below should work on Perl 5.8.
1458 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1459 ## the native perl version of it (which is what would operate under miniperl)
1460 ## is extremely slow, as it does a string eval every call.
1461 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1462 #                            && defined eval "require Scalar::Util";
1463 #
1464 #sub objaddr($) {
1465 #    # Returns the address of the blessed input object.  Uses the XS version if
1466 #    # available.  It doesn't check for blessedness because that would do a
1467 #    # string eval every call, and the program is structured so that this is
1468 #    # never called for a non-blessed object.
1469 #
1470 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1471 #
1472 #    # Check at least that is a ref.
1473 #    my $pkg = ref($_[0]) or return undef;
1474 #
1475 #    # Change to a fake package to defeat any overloaded stringify
1476 #    bless $_[0], 'main::Fake';
1477 #
1478 #    # Numifying a ref gives its address.
1479 #    my $addr = pack 'J', $_[0];
1480 #
1481 #    # Return to original class
1482 #    bless $_[0], $pkg;
1483 #    return $addr;
1484 #}
1485
1486 sub max ($$) {
1487     my $a = shift;
1488     my $b = shift;
1489     return $a if $a >= $b;
1490     return $b;
1491 }
1492
1493 sub min ($$) {
1494     my $a = shift;
1495     my $b = shift;
1496     return $a if $a <= $b;
1497     return $b;
1498 }
1499
1500 sub clarify_number ($) {
1501     # This returns the input number with underscores inserted every 3 digits
1502     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1503     # checked.
1504
1505     my $number = shift;
1506     my $pos = length($number) - 3;
1507     return $number if $pos <= 1;
1508     while ($pos > 0) {
1509         substr($number, $pos, 0) = '_';
1510         $pos -= 3;
1511     }
1512     return $number;
1513 }
1514
1515
1516 package Carp;
1517
1518 # These routines give a uniform treatment of messages in this program.  They
1519 # are placed in the Carp package to cause the stack trace to not include them,
1520 # although an alternative would be to use another package and set @CARP_NOT
1521 # for it.
1522
1523 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1524
1525 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1526 # and overload trying to load Scalar:Util under miniperl.  See
1527 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1528 undef $overload::VERSION;
1529
1530 sub my_carp {
1531     my $message = shift || "";
1532     my $nofold = shift || 0;
1533
1534     if ($message) {
1535         $message = main::join_lines($message);
1536         $message =~ s/^$0: *//;     # Remove initial program name
1537         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1538         $message = "\n$0: $message;";
1539
1540         # Fold the message with program name, semi-colon end punctuation
1541         # (which looks good with the message that carp appends to it), and a
1542         # hanging indent for continuation lines.
1543         $message = main::simple_fold($message, "", 4) unless $nofold;
1544         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1545                                     # appends is to the same line
1546     }
1547
1548     return $message if defined wantarray;   # If a caller just wants the msg
1549
1550     carp $message;
1551     return;
1552 }
1553
1554 sub my_carp_bug {
1555     # This is called when it is clear that the problem is caused by a bug in
1556     # this program.
1557
1558     my $message = shift;
1559     $message =~ s/^$0: *//;
1560     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1561     carp $message;
1562     return;
1563 }
1564
1565 sub carp_too_few_args {
1566     if (@_ != 2) {
1567         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1568         return;
1569     }
1570
1571     my $args_ref = shift;
1572     my $count = shift;
1573
1574     my_carp_bug("Need at least $count arguments to "
1575         . (caller 1)[3]
1576         . ".  Instead got: '"
1577         . join ', ', @$args_ref
1578         . "'.  No action taken.");
1579     return;
1580 }
1581
1582 sub carp_extra_args {
1583     my $args_ref = shift;
1584     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1585
1586     unless (ref $args_ref) {
1587         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1588         return;
1589     }
1590     my ($package, $file, $line) = caller;
1591     my $subroutine = (caller 1)[3];
1592
1593     my $list;
1594     if (ref $args_ref eq 'HASH') {
1595         foreach my $key (keys %$args_ref) {
1596             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1597         }
1598         $list = join ', ', each %{$args_ref};
1599     }
1600     elsif (ref $args_ref eq 'ARRAY') {
1601         foreach my $arg (@$args_ref) {
1602             $arg = $UNDEF unless defined $arg;
1603         }
1604         $list = join ', ', @$args_ref;
1605     }
1606     else {
1607         my_carp_bug("Can't cope with ref "
1608                 . ref($args_ref)
1609                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1610         return;
1611     }
1612
1613     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1614     return;
1615 }
1616
1617 package main;
1618
1619 { # Closure
1620
1621     # This program uses the inside-out method for objects, as recommended in
1622     # "Perl Best Practices".  This closure aids in generating those.  There
1623     # are two routines.  setup_package() is called once per package to set
1624     # things up, and then set_access() is called for each hash representing a
1625     # field in the object.  These routines arrange for the object to be
1626     # properly destroyed when no longer used, and for standard accessor
1627     # functions to be generated.  If you need more complex accessors, just
1628     # write your own and leave those accesses out of the call to set_access().
1629     # More details below.
1630
1631     my %constructor_fields; # fields that are to be used in constructors; see
1632                             # below
1633
1634     # The values of this hash will be the package names as keys to other
1635     # hashes containing the name of each field in the package as keys, and
1636     # references to their respective hashes as values.
1637     my %package_fields;
1638
1639     sub setup_package {
1640         # Sets up the package, creating standard DESTROY and dump methods
1641         # (unless already defined).  The dump method is used in debugging by
1642         # simple_dumper().
1643         # The optional parameters are:
1644         #   a)  a reference to a hash, that gets populated by later
1645         #       set_access() calls with one of the accesses being
1646         #       'constructor'.  The caller can then refer to this, but it is
1647         #       not otherwise used by these two routines.
1648         #   b)  a reference to a callback routine to call during destruction
1649         #       of the object, before any fields are actually destroyed
1650
1651         my %args = @_;
1652         my $constructor_ref = delete $args{'Constructor_Fields'};
1653         my $destroy_callback = delete $args{'Destroy_Callback'};
1654         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1655
1656         my %fields;
1657         my $package = (caller)[0];
1658
1659         $package_fields{$package} = \%fields;
1660         $constructor_fields{$package} = $constructor_ref;
1661
1662         unless ($package->can('DESTROY')) {
1663             my $destroy_name = "${package}::DESTROY";
1664             no strict "refs";
1665
1666             # Use typeglob to give the anonymous subroutine the name we want
1667             *$destroy_name = sub {
1668                 my $self = shift;
1669                 my $addr = do { no overloading; pack 'J', $self; };
1670
1671                 $self->$destroy_callback if $destroy_callback;
1672                 foreach my $field (keys %{$package_fields{$package}}) {
1673                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1674                     delete $package_fields{$package}{$field}{$addr};
1675                 }
1676                 return;
1677             }
1678         }
1679
1680         unless ($package->can('dump')) {
1681             my $dump_name = "${package}::dump";
1682             no strict "refs";
1683             *$dump_name = sub {
1684                 my $self = shift;
1685                 return dump_inside_out($self, $package_fields{$package}, @_);
1686             }
1687         }
1688         return;
1689     }
1690
1691     sub set_access {
1692         # Arrange for the input field to be garbage collected when no longer
1693         # needed.  Also, creates standard accessor functions for the field
1694         # based on the optional parameters-- none if none of these parameters:
1695         #   'addable'    creates an 'add_NAME()' accessor function.
1696         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1697         #                function.
1698         #   'settable'   creates a 'set_NAME()' accessor function.
1699         #   'constructor' doesn't create an accessor function, but adds the
1700         #                field to the hash that was previously passed to
1701         #                setup_package();
1702         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1703         # 'add' etc. all mean 'addable'.
1704         # The read accessor function will work on both array and scalar
1705         # values.  If another accessor in the parameter list is 'a', the read
1706         # access assumes an array.  You can also force it to be array access
1707         # by specifying 'readable_array' instead of 'readable'
1708         #
1709         # A sort-of 'protected' access can be set-up by preceding the addable,
1710         # readable or settable with some initial portion of 'protected_' (but,
1711         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1712         # "protection" is only by convention.  All that happens is that the
1713         # accessor functions' names begin with an underscore.  So instead of
1714         # calling set_foo, the call is _set_foo.  (Real protection could be
1715         # accomplished by having a new subroutine, end_package, called at the
1716         # end of each package, and then storing the __LINE__ ranges and
1717         # checking them on every accessor.  But that is way overkill.)
1718
1719         # We create anonymous subroutines as the accessors and then use
1720         # typeglobs to assign them to the proper package and name
1721
1722         my $name = shift;   # Name of the field
1723         my $field = shift;  # Reference to the inside-out hash containing the
1724                             # field
1725
1726         my $package = (caller)[0];
1727
1728         if (! exists $package_fields{$package}) {
1729             croak "$0: Must call 'setup_package' before 'set_access'";
1730         }
1731
1732         # Stash the field so DESTROY can get it.
1733         $package_fields{$package}{$name} = $field;
1734
1735         # Remaining arguments are the accessors.  For each...
1736         foreach my $access (@_) {
1737             my $access = lc $access;
1738
1739             my $protected = "";
1740
1741             # Match the input as far as it goes.
1742             if ($access =~ /^(p[^_]*)_/) {
1743                 $protected = $1;
1744                 if (substr('protected_', 0, length $protected)
1745                     eq $protected)
1746                 {
1747
1748                     # Add 1 for the underscore not included in $protected
1749                     $access = substr($access, length($protected) + 1);
1750                     $protected = '_';
1751                 }
1752                 else {
1753                     $protected = "";
1754                 }
1755             }
1756
1757             if (substr('addable', 0, length $access) eq $access) {
1758                 my $subname = "${package}::${protected}add_$name";
1759                 no strict "refs";
1760
1761                 # add_ accessor.  Don't add if already there, which we
1762                 # determine using 'eq' for scalars and '==' otherwise.
1763                 *$subname = sub {
1764                     use strict "refs";
1765                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1766                     my $self = shift;
1767                     my $value = shift;
1768                     my $addr = do { no overloading; pack 'J', $self; };
1769                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1770                     if (ref $value) {
1771                         return if grep { $value == $_ } @{$field->{$addr}};
1772                     }
1773                     else {
1774                         return if grep { $value eq $_ } @{$field->{$addr}};
1775                     }
1776                     push @{$field->{$addr}}, $value;
1777                     return;
1778                 }
1779             }
1780             elsif (substr('constructor', 0, length $access) eq $access) {
1781                 if ($protected) {
1782                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1783                 }
1784                 else {
1785                     $constructor_fields{$package}{$name} = $field;
1786                 }
1787             }
1788             elsif (substr('readable_array', 0, length $access) eq $access) {
1789
1790                 # Here has read access.  If one of the other parameters for
1791                 # access is array, or this one specifies array (by being more
1792                 # than just 'readable_'), then create a subroutine that
1793                 # assumes the data is an array.  Otherwise just a scalar
1794                 my $subname = "${package}::${protected}$name";
1795                 if (grep { /^a/i } @_
1796                     or length($access) > length('readable_'))
1797                 {
1798                     no strict "refs";
1799                     *$subname = sub {
1800                         use strict "refs";
1801                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1802                         my $addr = do { no overloading; pack 'J', $_[0]; };
1803                         if (ref $field->{$addr} ne 'ARRAY') {
1804                             my $type = ref $field->{$addr};
1805                             $type = 'scalar' unless $type;
1806                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1807                             return;
1808                         }
1809                         return scalar @{$field->{$addr}} unless wantarray;
1810
1811                         # Make a copy; had problems with caller modifying the
1812                         # original otherwise
1813                         my @return = @{$field->{$addr}};
1814                         return @return;
1815                     }
1816                 }
1817                 else {
1818
1819                     # Here not an array value, a simpler function.
1820                     no strict "refs";
1821                     *$subname = sub {
1822                         use strict "refs";
1823                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1824                         no overloading;
1825                         return $field->{pack 'J', $_[0]};
1826                     }
1827                 }
1828             }
1829             elsif (substr('settable', 0, length $access) eq $access) {
1830                 my $subname = "${package}::${protected}set_$name";
1831                 no strict "refs";
1832                 *$subname = sub {
1833                     use strict "refs";
1834                     if (main::DEBUG) {
1835                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1836                         Carp::carp_extra_args(\@_) if @_ > 2;
1837                     }
1838                     # $self is $_[0]; $value is $_[1]
1839                     no overloading;
1840                     $field->{pack 'J', $_[0]} = $_[1];
1841                     return;
1842                 }
1843             }
1844             else {
1845                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1846             }
1847         }
1848         return;
1849     }
1850 }
1851
1852 package Input_file;
1853
1854 # All input files use this object, which stores various attributes about them,
1855 # and provides for convenient, uniform handling.  The run method wraps the
1856 # processing.  It handles all the bookkeeping of opening, reading, and closing
1857 # the file, returning only significant input lines.
1858 #
1859 # Each object gets a handler which processes the body of the file, and is
1860 # called by run().  Most should use the generic, default handler, which has
1861 # code scrubbed to handle things you might not expect.  A handler should
1862 # basically be a while(next_line()) {...} loop.
1863 #
1864 # You can also set up handlers to
1865 #   1) call before the first line is read for pre processing
1866 #   2) call to adjust each line of the input before the main handler gets them
1867 #   3) call upon EOF before the main handler exits its loop
1868 #   4) call at the end for post processing
1869 #
1870 # $_ is used to store the input line, and is to be filtered by the
1871 # each_line_handler()s.  So, if the format of the line is not in the desired
1872 # format for the main handler, these are used to do that adjusting.  They can
1873 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1874 # so the $_ output of one is used as the input to the next.  None of the other
1875 # handlers are stackable, but could easily be changed to be so.
1876 #
1877 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1878 # which insert the parameters as lines to be processed before the next input
1879 # file line is read.  This allows the EOF handler to flush buffers, for
1880 # example.  The difference between the two routines is that the lines inserted
1881 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1882 # called it from such a handler, you would get infinite recursion.)  Lines
1883 # inserted by insert_adjusted_lines() go directly to the main handler without
1884 # any adjustments.  If the  post-processing handler calls any of these, there
1885 # will be no effect.  Some error checking for these conditions could be added,
1886 # but it hasn't been done.
1887 #
1888 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1889 # to prevent further processing of the line.  This routine will output the
1890 # message as a warning once, and then keep a count of the lines that have the
1891 # same message, and output that count at the end of the file's processing.
1892 # This keeps the number of messages down to a manageable amount.
1893 #
1894 # get_missings() should be called to retrieve any @missing input lines.
1895 # Messages will be raised if this isn't done if the options aren't to ignore
1896 # missings.
1897
1898 sub trace { return main::trace(@_); }
1899
1900 { # Closure
1901     # Keep track of fields that are to be put into the constructor.
1902     my %constructor_fields;
1903
1904     main::setup_package(Constructor_Fields => \%constructor_fields);
1905
1906     my %file; # Input file name, required
1907     main::set_access('file', \%file, qw{ c r });
1908
1909     my %first_released; # Unicode version file was first released in, required
1910     main::set_access('first_released', \%first_released, qw{ c r });
1911
1912     my %handler;    # Subroutine to process the input file, defaults to
1913                     # 'process_generic_property_file'
1914     main::set_access('handler', \%handler, qw{ c });
1915
1916     my %property;
1917     # name of property this file is for.  defaults to none, meaning not
1918     # applicable, or is otherwise determinable, for example, from each line.
1919     main::set_access('property', \%property, qw{ c });
1920
1921     my %optional;
1922     # If this is true, the file is optional.  If not present, no warning is
1923     # output.  If it is present, the string given by this parameter is
1924     # evaluated, and if false the file is not processed.
1925     main::set_access('optional', \%optional, 'c', 'r');
1926
1927     my %non_skip;
1928     # This is used for debugging, to skip processing of all but a few input
1929     # files.  Add 'non_skip => 1' to the constructor for those files you want
1930     # processed when you set the $debug_skip global.
1931     main::set_access('non_skip', \%non_skip, 'c');
1932
1933     my %skip;
1934     # This is used to skip processing of this input file semi-permanently.
1935     # It is used for files that we aren't planning to process anytime soon,
1936     # but want to allow to be in the directory and not raise a message that we
1937     # are not handling.  Mostly for test files.  This is in contrast to the
1938     # non_skip element, which is supposed to be used very temporarily for
1939     # debugging.  Sets 'optional' to 1
1940     main::set_access('skip', \%skip, 'c');
1941
1942     my %each_line_handler;
1943     # list of subroutines to look at and filter each non-comment line in the
1944     # file.  defaults to none.  The subroutines are called in order, each is
1945     # to adjust $_ for the next one, and the final one adjusts it for
1946     # 'handler'
1947     main::set_access('each_line_handler', \%each_line_handler, 'c');
1948
1949     my %has_missings_defaults;
1950     # ? Are there lines in the file giving default values for code points
1951     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
1952     # the norm, but IGNORED means it has such lines, but the handler doesn't
1953     # use them.  Having these three states allows us to catch changes to the
1954     # UCD that this program should track
1955     main::set_access('has_missings_defaults',
1956                                         \%has_missings_defaults, qw{ c r });
1957
1958     my %pre_handler;
1959     # Subroutine to call before doing anything else in the file.  If undef, no
1960     # such handler is called.
1961     main::set_access('pre_handler', \%pre_handler, qw{ c });
1962
1963     my %eof_handler;
1964     # Subroutine to call upon getting an EOF on the input file, but before
1965     # that is returned to the main handler.  This is to allow buffers to be
1966     # flushed.  The handler is expected to call insert_lines() or
1967     # insert_adjusted() with the buffered material
1968     main::set_access('eof_handler', \%eof_handler, qw{ c r });
1969
1970     my %post_handler;
1971     # Subroutine to call after all the lines of the file are read in and
1972     # processed.  If undef, no such handler is called.
1973     main::set_access('post_handler', \%post_handler, qw{ c });
1974
1975     my %progress_message;
1976     # Message to print to display progress in lieu of the standard one
1977     main::set_access('progress_message', \%progress_message, qw{ c });
1978
1979     my %handle;
1980     # cache open file handle, internal.  Is undef if file hasn't been
1981     # processed at all, empty if has;
1982     main::set_access('handle', \%handle);
1983
1984     my %added_lines;
1985     # cache of lines added virtually to the file, internal
1986     main::set_access('added_lines', \%added_lines);
1987
1988     my %errors;
1989     # cache of errors found, internal
1990     main::set_access('errors', \%errors);
1991
1992     my %missings;
1993     # storage of '@missing' defaults lines
1994     main::set_access('missings', \%missings);
1995
1996     sub new {
1997         my $class = shift;
1998
1999         my $self = bless \do{ my $anonymous_scalar }, $class;
2000         my $addr = do { no overloading; pack 'J', $self; };
2001
2002         # Set defaults
2003         $handler{$addr} = \&main::process_generic_property_file;
2004         $non_skip{$addr} = 0;
2005         $skip{$addr} = 0;
2006         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2007         $handle{$addr} = undef;
2008         $added_lines{$addr} = [ ];
2009         $each_line_handler{$addr} = [ ];
2010         $errors{$addr} = { };
2011         $missings{$addr} = [ ];
2012
2013         # Two positional parameters.
2014         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2015         $file{$addr} = main::internal_file_to_platform(shift);
2016         $first_released{$addr} = shift;
2017
2018         # The rest of the arguments are key => value pairs
2019         # %constructor_fields has been set up earlier to list all possible
2020         # ones.  Either set or push, depending on how the default has been set
2021         # up just above.
2022         my %args = @_;
2023         foreach my $key (keys %args) {
2024             my $argument = $args{$key};
2025
2026             # Note that the fields are the lower case of the constructor keys
2027             my $hash = $constructor_fields{lc $key};
2028             if (! defined $hash) {
2029                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2030                 next;
2031             }
2032             if (ref $hash->{$addr} eq 'ARRAY') {
2033                 if (ref $argument eq 'ARRAY') {
2034                     foreach my $argument (@{$argument}) {
2035                         next if ! defined $argument;
2036                         push @{$hash->{$addr}}, $argument;
2037                     }
2038                 }
2039                 else {
2040                     push @{$hash->{$addr}}, $argument if defined $argument;
2041                 }
2042             }
2043             else {
2044                 $hash->{$addr} = $argument;
2045             }
2046             delete $args{$key};
2047         };
2048
2049         # If the file has a property for it, it means that the property is not
2050         # listed in the file's entries.  So add a handler to the list of line
2051         # handlers to insert the property name into the lines, to provide a
2052         # uniform interface to the final processing subroutine.
2053         # the final code doesn't have to worry about that.
2054         if ($property{$addr}) {
2055             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2056         }
2057
2058         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2059             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2060         }
2061
2062         $optional{$addr} = 1 if $skip{$addr};
2063
2064         return $self;
2065     }
2066
2067
2068     use overload
2069         fallback => 0,
2070         qw("") => "_operator_stringify",
2071         "." => \&main::_operator_dot,
2072     ;
2073
2074     sub _operator_stringify {
2075         my $self = shift;
2076
2077         return __PACKAGE__ . " object for " . $self->file;
2078     }
2079
2080     # flag to make sure extracted files are processed early
2081     my $seen_non_extracted_non_age = 0;
2082
2083     sub run {
2084         # Process the input object $self.  This opens and closes the file and
2085         # calls all the handlers for it.  Currently,  this can only be called
2086         # once per file, as it destroy's the EOF handler
2087
2088         my $self = shift;
2089         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2090
2091         my $addr = do { no overloading; pack 'J', $self; };
2092
2093         my $file = $file{$addr};
2094
2095         # Don't process if not expecting this file (because released later
2096         # than this Unicode version), and isn't there.  This means if someone
2097         # copies it into an earlier version's directory, we will go ahead and
2098         # process it.
2099         return if $first_released{$addr} gt $v_version && ! -e $file;
2100
2101         # If in debugging mode and this file doesn't have the non-skip
2102         # flag set, and isn't one of the critical files, skip it.
2103         if ($debug_skip
2104             && $first_released{$addr} ne v0
2105             && ! $non_skip{$addr})
2106         {
2107             print "Skipping $file in debugging\n" if $verbosity;
2108             return;
2109         }
2110
2111         # File could be optional
2112         if ($optional{$addr}) {
2113             return unless -e $file;
2114             my $result = eval $optional{$addr};
2115             if (! defined $result) {
2116                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2117                 return;
2118             }
2119             if (! $result) {
2120                 if ($verbosity) {
2121                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2122                 }
2123                 return;
2124             }
2125         }
2126
2127         if (! defined $file || ! -e $file) {
2128
2129             # If the file doesn't exist, see if have internal data for it
2130             # (based on first_released being 0).
2131             if ($first_released{$addr} eq v0) {
2132                 $handle{$addr} = 'pretend_is_open';
2133             }
2134             else {
2135                 if (! $optional{$addr}  # File could be optional
2136                     && $v_version ge $first_released{$addr})
2137                 {
2138                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2139                 }
2140                 return;
2141             }
2142         }
2143         else {
2144
2145             # Here, the file exists.  Some platforms may change the case of
2146             # its name
2147             if ($seen_non_extracted_non_age) {
2148                 if ($file =~ /$EXTRACTED/i) {
2149                     Carp::my_carp_bug(join_lines(<<END
2150 $file should be processed just after the 'Prop...Alias' files, and before
2151 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2152 have subtle problems
2153 END
2154                     ));
2155                 }
2156             }
2157             elsif ($EXTRACTED_DIR
2158                     && $first_released{$addr} ne v0
2159                     && $file !~ /$EXTRACTED/i
2160                     && lc($file) ne 'dage.txt')
2161             {
2162                 # We don't set this (by the 'if' above) if we have no
2163                 # extracted directory, so if running on an early version,
2164                 # this test won't work.  Not worth worrying about.
2165                 $seen_non_extracted_non_age = 1;
2166             }
2167
2168             # And mark the file as having being processed, and warn if it
2169             # isn't a file we are expecting.  As we process the files,
2170             # they are deleted from the hash, so any that remain at the
2171             # end of the program are files that we didn't process.
2172             my $fkey = File::Spec->rel2abs($file);
2173             my $expecting = delete $potential_files{$fkey};
2174             $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
2175             Carp::my_carp("Was not expecting '$file'.") if
2176                     ! $expecting
2177                     && ! defined $handle{$addr};
2178
2179             # Having deleted from expected files, we can quit if not to do
2180             # anything.  Don't print progress unless really want verbosity
2181             if ($skip{$addr}) {
2182                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2183                 return;
2184             }
2185
2186             # Open the file, converting the slashes used in this program
2187             # into the proper form for the OS
2188             my $file_handle;
2189             if (not open $file_handle, "<", $file) {
2190                 Carp::my_carp("Can't open $file.  Skipping: $!");
2191                 return 0;
2192             }
2193             $handle{$addr} = $file_handle; # Cache the open file handle
2194         }
2195
2196         if ($verbosity >= $PROGRESS) {
2197             if ($progress_message{$addr}) {
2198                 print "$progress_message{$addr}\n";
2199             }
2200             else {
2201                 # If using a virtual file, say so.
2202                 print "Processing ", (-e $file)
2203                                        ? $file
2204                                        : "substitute $file",
2205                                      "\n";
2206             }
2207         }
2208
2209
2210         # Call any special handler for before the file.
2211         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2212
2213         # Then the main handler
2214         &{$handler{$addr}}($self);
2215
2216         # Then any special post-file handler.
2217         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2218
2219         # If any errors have been accumulated, output the counts (as the first
2220         # error message in each class was output when it was encountered).
2221         if ($errors{$addr}) {
2222             my $total = 0;
2223             my $types = 0;
2224             foreach my $error (keys %{$errors{$addr}}) {
2225                 $total += $errors{$addr}->{$error};
2226                 delete $errors{$addr}->{$error};
2227                 $types++;
2228             }
2229             if ($total > 1) {
2230                 my $message
2231                         = "A total of $total lines had errors in $file.  ";
2232
2233                 $message .= ($types == 1)
2234                             ? '(Only the first one was displayed.)'
2235                             : '(Only the first of each type was displayed.)';
2236                 Carp::my_carp($message);
2237             }
2238         }
2239
2240         if (@{$missings{$addr}}) {
2241             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2242         }
2243
2244         # If a real file handle, close it.
2245         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2246                                                         ref $handle{$addr};
2247         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2248                                # the file, as opposed to undef
2249         return;
2250     }
2251
2252     sub next_line {
2253         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2254         # if such a line exists.  'logical' means that any lines that have
2255         # been added via insert_lines() will be returned in $_ before the file
2256         # is read again.
2257
2258         my $self = shift;
2259         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2260
2261         my $addr = do { no overloading; pack 'J', $self; };
2262
2263         # Here the file is open (or if the handle is not a ref, is an open
2264         # 'virtual' file).  Get the next line; any inserted lines get priority
2265         # over the file itself.
2266         my $adjusted;
2267
2268         LINE:
2269         while (1) { # Loop until find non-comment, non-empty line
2270             #local $to_trace = 1 if main::DEBUG;
2271             my $inserted_ref = shift @{$added_lines{$addr}};
2272             if (defined $inserted_ref) {
2273                 ($adjusted, $_) = @{$inserted_ref};
2274                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2275                 return 1 if $adjusted;
2276             }
2277             else {
2278                 last if ! ref $handle{$addr}; # Don't read unless is real file
2279                 last if ! defined ($_ = readline $handle{$addr});
2280             }
2281             chomp;
2282             trace $_ if main::DEBUG && $to_trace;
2283
2284             # See if this line is the comment line that defines what property
2285             # value that code points that are not listed in the file should
2286             # have.  The format or existence of these lines is not guaranteed
2287             # by Unicode since they are comments, but the documentation says
2288             # that this was added for machine-readability, so probably won't
2289             # change.  This works starting in Unicode Version 5.0.  They look
2290             # like:
2291             #
2292             # @missing: 0000..10FFFF; Not_Reordered
2293             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2294             # @missing: 0000..10FFFF; ; NaN
2295             #
2296             # Save the line for a later get_missings() call.
2297             if (/$missing_defaults_prefix/) {
2298                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2299                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2300                 }
2301                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2302                     my @defaults = split /\s* ; \s*/x, $_;
2303
2304                     # The first field is the @missing, which ends in a
2305                     # semi-colon, so can safely shift.
2306                     shift @defaults;
2307
2308                     # Some of these lines may have empty field placeholders
2309                     # which get in the way.  An example is:
2310                     # @missing: 0000..10FFFF; ; NaN
2311                     # Remove them.  Process starting from the top so the
2312                     # splice doesn't affect things still to be looked at.
2313                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2314                         next if $defaults[$i] ne "";
2315                         splice @defaults, $i, 1;
2316                     }
2317
2318                     # What's left should be just the property (maybe) and the
2319                     # default.  Having only one element means it doesn't have
2320                     # the property.
2321                     my $default;
2322                     my $property;
2323                     if (@defaults >= 1) {
2324                         if (@defaults == 1) {
2325                             $default = $defaults[0];
2326                         }
2327                         else {
2328                             $property = $defaults[0];
2329                             $default = $defaults[1];
2330                         }
2331                     }
2332
2333                     if (@defaults < 1
2334                         || @defaults > 2
2335                         || ($default =~ /^</
2336                             && $default !~ /^<code *point>$/i
2337                             && $default !~ /^<none>$/i))
2338                     {
2339                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2340                     }
2341                     else {
2342
2343                         # If the property is missing from the line, it should
2344                         # be the one for the whole file
2345                         $property = $property{$addr} if ! defined $property;
2346
2347                         # Change <none> to the null string, which is what it
2348                         # really means.  If the default is the code point
2349                         # itself, set it to <code point>, which is what
2350                         # Unicode uses (but sometimes they've forgotten the
2351                         # space)
2352                         if ($default =~ /^<none>$/i) {
2353                             $default = "";
2354                         }
2355                         elsif ($default =~ /^<code *point>$/i) {
2356                             $default = $CODE_POINT;
2357                         }
2358
2359                         # Store them as a sub-arrays with both components.
2360                         push @{$missings{$addr}}, [ $default, $property ];
2361                     }
2362                 }
2363
2364                 # There is nothing for the caller to process on this comment
2365                 # line.
2366                 next;
2367             }
2368
2369             # Remove comments and trailing space, and skip this line if the
2370             # result is empty
2371             s/#.*//;
2372             s/\s+$//;
2373             next if /^$/;
2374
2375             # Call any handlers for this line, and skip further processing of
2376             # the line if the handler sets the line to null.
2377             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2378                 &{$sub_ref}($self);
2379                 next LINE if /^$/;
2380             }
2381
2382             # Here the line is ok.  return success.
2383             return 1;
2384         } # End of looping through lines.
2385
2386         # If there is an EOF handler, call it (only once) and if it generates
2387         # more lines to process go back in the loop to handle them.
2388         if ($eof_handler{$addr}) {
2389             &{$eof_handler{$addr}}($self);
2390             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2391             goto LINE if $added_lines{$addr};
2392         }
2393
2394         # Return failure -- no more lines.
2395         return 0;
2396
2397     }
2398
2399 #   Not currently used, not fully tested.
2400 #    sub peek {
2401 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2402 #        # record.  Not callable from an each_line_handler(), nor does it call
2403 #        # an each_line_handler() on the line.
2404 #
2405 #        my $self = shift;
2406 #        my $addr = do { no overloading; pack 'J', $self; };
2407 #
2408 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2409 #            my ($adjusted, $line) = @{$inserted_ref};
2410 #            next if $adjusted;
2411 #
2412 #            # Remove comments and trailing space, and return a non-empty
2413 #            # resulting line
2414 #            $line =~ s/#.*//;
2415 #            $line =~ s/\s+$//;
2416 #            return $line if $line ne "";
2417 #        }
2418 #
2419 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2420 #        while (1) { # Loop until find non-comment, non-empty line
2421 #            local $to_trace = 1 if main::DEBUG;
2422 #            trace $_ if main::DEBUG && $to_trace;
2423 #            return if ! defined (my $line = readline $handle{$addr});
2424 #            chomp $line;
2425 #            push @{$added_lines{$addr}}, [ 0, $line ];
2426 #
2427 #            $line =~ s/#.*//;
2428 #            $line =~ s/\s+$//;
2429 #            return $line if $line ne "";
2430 #        }
2431 #
2432 #        return;
2433 #    }
2434
2435
2436     sub insert_lines {
2437         # Lines can be inserted so that it looks like they were in the input
2438         # file at the place it was when this routine is called.  See also
2439         # insert_adjusted_lines().  Lines inserted via this routine go through
2440         # any each_line_handler()
2441
2442         my $self = shift;
2443
2444         # Each inserted line is an array, with the first element being 0 to
2445         # indicate that this line hasn't been adjusted, and needs to be
2446         # processed.
2447         no overloading;
2448         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2449         return;
2450     }
2451
2452     sub insert_adjusted_lines {
2453         # Lines can be inserted so that it looks like they were in the input
2454         # file at the place it was when this routine is called.  See also
2455         # insert_lines().  Lines inserted via this routine are already fully
2456         # adjusted, ready to be processed; each_line_handler()s handlers will
2457         # not be called.  This means this is not a completely general
2458         # facility, as only the last each_line_handler on the stack should
2459         # call this.  It could be made more general, by passing to each of the
2460         # line_handlers their position on the stack, which they would pass on
2461         # to this routine, and that would replace the boolean first element in
2462         # the anonymous array pushed here, so that the next_line routine could
2463         # use that to call only those handlers whose index is after it on the
2464         # stack.  But this is overkill for what is needed now.
2465
2466         my $self = shift;
2467         trace $_[0] if main::DEBUG && $to_trace;
2468
2469         # Each inserted line is an array, with the first element being 1 to
2470         # indicate that this line has been adjusted
2471         no overloading;
2472         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2473         return;
2474     }
2475
2476     sub get_missings {
2477         # Returns the stored up @missings lines' values, and clears the list.
2478         # The values are in an array, consisting of the default in the first
2479         # element, and the property in the 2nd.  However, since these lines
2480         # can be stacked up, the return is an array of all these arrays.
2481
2482         my $self = shift;
2483         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2484
2485         my $addr = do { no overloading; pack 'J', $self; };
2486
2487         # If not accepting a list return, just return the first one.
2488         return shift @{$missings{$addr}} unless wantarray;
2489
2490         my @return = @{$missings{$addr}};
2491         undef @{$missings{$addr}};
2492         return @return;
2493     }
2494
2495     sub _insert_property_into_line {
2496         # Add a property field to $_, if this file requires it.
2497
2498         my $self = shift;
2499         my $addr = do { no overloading; pack 'J', $self; };
2500         my $property = $property{$addr};
2501         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2502
2503         $_ =~ s/(;|$)/; $property$1/;
2504         return;
2505     }
2506
2507     sub carp_bad_line {
2508         # Output consistent error messages, using either a generic one, or the
2509         # one given by the optional parameter.  To avoid gazillions of the
2510         # same message in case the syntax of a  file is way off, this routine
2511         # only outputs the first instance of each message, incrementing a
2512         # count so the totals can be output at the end of the file.
2513
2514         my $self = shift;
2515         my $message = shift;
2516         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2517
2518         my $addr = do { no overloading; pack 'J', $self; };
2519
2520         $message = 'Unexpected line' unless $message;
2521
2522         # No trailing punctuation so as to fit with our addenda.
2523         $message =~ s/[.:;,]$//;
2524
2525         # If haven't seen this exact message before, output it now.  Otherwise
2526         # increment the count of how many times it has occurred
2527         unless ($errors{$addr}->{$message}) {
2528             Carp::my_carp("$message in '$_' in "
2529                             . $file{$addr}
2530                             . " at line $..  Skipping this line;");
2531             $errors{$addr}->{$message} = 1;
2532         }
2533         else {
2534             $errors{$addr}->{$message}++;
2535         }
2536
2537         # Clear the line to prevent any further (meaningful) processing of it.
2538         $_ = "";
2539
2540         return;
2541     }
2542 } # End closure
2543
2544 package Multi_Default;
2545
2546 # Certain properties in early versions of Unicode had more than one possible
2547 # default for code points missing from the files.  In these cases, one
2548 # default applies to everything left over after all the others are applied,
2549 # and for each of the others, there is a description of which class of code
2550 # points applies to it.  This object helps implement this by storing the
2551 # defaults, and for all but that final default, an eval string that generates
2552 # the class that it applies to.
2553
2554
2555 {   # Closure
2556
2557     main::setup_package();
2558
2559     my %class_defaults;
2560     # The defaults structure for the classes
2561     main::set_access('class_defaults', \%class_defaults);
2562
2563     my %other_default;
2564     # The default that applies to everything left over.
2565     main::set_access('other_default', \%other_default, 'r');
2566
2567
2568     sub new {
2569         # The constructor is called with default => eval pairs, terminated by
2570         # the left-over default. e.g.
2571         # Multi_Default->new(
2572         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2573         #               -  0x200D',
2574         #        'R' => 'some other expression that evaluates to code points',
2575         #        .
2576         #        .
2577         #        .
2578         #        'U'));
2579
2580         my $class = shift;
2581
2582         my $self = bless \do{my $anonymous_scalar}, $class;
2583         my $addr = do { no overloading; pack 'J', $self; };
2584
2585         while (@_ > 1) {
2586             my $default = shift;
2587             my $eval = shift;
2588             $class_defaults{$addr}->{$default} = $eval;
2589         }
2590
2591         $other_default{$addr} = shift;
2592
2593         return $self;
2594     }
2595
2596     sub get_next_defaults {
2597         # Iterates and returns the next class of defaults.
2598         my $self = shift;
2599         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2600
2601         my $addr = do { no overloading; pack 'J', $self; };
2602
2603         return each %{$class_defaults{$addr}};
2604     }
2605 }
2606
2607 package Alias;
2608
2609 # An alias is one of the names that a table goes by.  This class defines them
2610 # including some attributes.  Everything is currently setup in the
2611 # constructor.
2612
2613
2614 {   # Closure
2615
2616     main::setup_package();
2617
2618     my %name;
2619     main::set_access('name', \%name, 'r');
2620
2621     my %loose_match;
2622     # Should this name match loosely or not.
2623     main::set_access('loose_match', \%loose_match, 'r');
2624
2625     my %make_pod_entry;
2626     # Some aliases should not get their own entries because they are covered
2627     # by a wild-card, and some we want to discourage use of.  Binary
2628     main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2629
2630     my %status;
2631     # Aliases have a status, like deprecated, or even suppressed (which means
2632     # they don't appear in documentation).  Enum
2633     main::set_access('status', \%status, 'r');
2634
2635     my %externally_ok;
2636     # Similarly, some aliases should not be considered as usable ones for
2637     # external use, such as file names, or we don't want documentation to
2638     # recommend them.  Boolean
2639     main::set_access('externally_ok', \%externally_ok, 'r');
2640
2641     sub new {
2642         my $class = shift;
2643
2644         my $self = bless \do { my $anonymous_scalar }, $class;
2645         my $addr = do { no overloading; pack 'J', $self; };
2646
2647         $name{$addr} = shift;
2648         $loose_match{$addr} = shift;
2649         $make_pod_entry{$addr} = shift;
2650         $externally_ok{$addr} = shift;
2651         $status{$addr} = shift;
2652
2653         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2654
2655         # Null names are never ok externally
2656         $externally_ok{$addr} = 0 if $name{$addr} eq "";
2657
2658         return $self;
2659     }
2660 }
2661
2662 package Range;
2663
2664 # A range is the basic unit for storing code points, and is described in the
2665 # comments at the beginning of the program.  Each range has a starting code
2666 # point; an ending code point (not less than the starting one); a value
2667 # that applies to every code point in between the two end-points, inclusive;
2668 # and an enum type that applies to the value.  The type is for the user's
2669 # convenience, and has no meaning here, except that a non-zero type is
2670 # considered to not obey the normal Unicode rules for having standard forms.
2671 #
2672 # The same structure is used for both map and match tables, even though in the
2673 # latter, the value (and hence type) is irrelevant and could be used as a
2674 # comment.  In map tables, the value is what all the code points in the range
2675 # map to.  Type 0 values have the standardized version of the value stored as
2676 # well, so as to not have to recalculate it a lot.
2677
2678 sub trace { return main::trace(@_); }
2679
2680 {   # Closure
2681
2682     main::setup_package();
2683
2684     my %start;
2685     main::set_access('start', \%start, 'r', 's');
2686
2687     my %end;
2688     main::set_access('end', \%end, 'r', 's');
2689
2690     my %value;
2691     main::set_access('value', \%value, 'r');
2692
2693     my %type;
2694     main::set_access('type', \%type, 'r');
2695
2696     my %standard_form;
2697     # The value in internal standard form.  Defined only if the type is 0.
2698     main::set_access('standard_form', \%standard_form);
2699
2700     # Note that if these fields change, the dump() method should as well
2701
2702     sub new {
2703         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2704         my $class = shift;
2705
2706         my $self = bless \do { my $anonymous_scalar }, $class;
2707         my $addr = do { no overloading; pack 'J', $self; };
2708
2709         $start{$addr} = shift;
2710         $end{$addr} = shift;
2711
2712         my %args = @_;
2713
2714         my $value = delete $args{'Value'};  # Can be 0
2715         $value = "" unless defined $value;
2716         $value{$addr} = $value;
2717
2718         $type{$addr} = delete $args{'Type'} || 0;
2719
2720         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2721
2722         if (! $type{$addr}) {
2723             $standard_form{$addr} = main::standardize($value);
2724         }
2725
2726         return $self;
2727     }
2728
2729     use overload
2730         fallback => 0,
2731         qw("") => "_operator_stringify",
2732         "." => \&main::_operator_dot,
2733     ;
2734
2735     sub _operator_stringify {
2736         my $self = shift;
2737         my $addr = do { no overloading; pack 'J', $self; };
2738
2739         # Output it like '0041..0065 (value)'
2740         my $return = sprintf("%04X", $start{$addr})
2741                         .  '..'
2742                         . sprintf("%04X", $end{$addr});
2743         my $value = $value{$addr};
2744         my $type = $type{$addr};
2745         $return .= ' (';
2746         $return .= "$value";
2747         $return .= ", Type=$type" if $type != 0;
2748         $return .= ')';
2749
2750         return $return;
2751     }
2752
2753     sub standard_form {
2754         # The standard form is the value itself if the standard form is
2755         # undefined (that is if the value is special)
2756
2757         my $self = shift;
2758         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2759
2760         my $addr = do { no overloading; pack 'J', $self; };
2761
2762         return $standard_form{$addr} if defined $standard_form{$addr};
2763         return $value{$addr};
2764     }
2765
2766     sub dump {
2767         # Human, not machine readable.  For machine readable, comment out this
2768         # entire routine and let the standard one take effect.
2769         my $self = shift;
2770         my $indent = shift;
2771         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2772
2773         my $addr = do { no overloading; pack 'J', $self; };
2774
2775         my $return = $indent
2776                     . sprintf("%04X", $start{$addr})
2777                     . '..'
2778                     . sprintf("%04X", $end{$addr})
2779                     . " '$value{$addr}';";
2780         if (! defined $standard_form{$addr}) {
2781             $return .= "(type=$type{$addr})";
2782         }
2783         elsif ($standard_form{$addr} ne $value{$addr}) {
2784             $return .= "(standard '$standard_form{$addr}')";
2785         }
2786         return $return;
2787     }
2788 } # End closure
2789
2790 package _Range_List_Base;
2791
2792 # Base class for range lists.  A range list is simply an ordered list of
2793 # ranges, so that the ranges with the lowest starting numbers are first in it.
2794 #
2795 # When a new range is added that is adjacent to an existing range that has the
2796 # same value and type, it merges with it to form a larger range.
2797 #
2798 # Ranges generally do not overlap, except that there can be multiple entries
2799 # of single code point ranges.  This is because of NameAliases.txt.
2800 #
2801 # In this program, there is a standard value such that if two different
2802 # values, have the same standard value, they are considered equivalent.  This
2803 # value was chosen so that it gives correct results on Unicode data
2804
2805 # There are a number of methods to manipulate range lists, and some operators
2806 # are overloaded to handle them.
2807
2808 sub trace { return main::trace(@_); }
2809
2810 { # Closure
2811
2812     our $addr;
2813
2814     main::setup_package();
2815
2816     my %ranges;
2817     # The list of ranges
2818     main::set_access('ranges', \%ranges, 'readable_array');
2819
2820     my %max;
2821     # The highest code point in the list.  This was originally a method, but
2822     # actual measurements said it was used a lot.
2823     main::set_access('max', \%max, 'r');
2824
2825     my %each_range_iterator;
2826     # Iterator position for each_range()
2827     main::set_access('each_range_iterator', \%each_range_iterator);
2828
2829     my %owner_name_of;
2830     # Name of parent this is attached to, if any.  Solely for better error
2831     # messages.
2832     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2833
2834     my %_search_ranges_cache;
2835     # A cache of the previous result from _search_ranges(), for better
2836     # performance
2837     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2838
2839     sub new {
2840         my $class = shift;
2841         my %args = @_;
2842
2843         # Optional initialization data for the range list.
2844         my $initialize = delete $args{'Initialize'};
2845
2846         my $self;
2847
2848         # Use _union() to initialize.  _union() returns an object of this
2849         # class, which means that it will call this constructor recursively.
2850         # But it won't have this $initialize parameter so that it won't
2851         # infinitely loop on this.
2852         return _union($class, $initialize, %args) if defined $initialize;
2853
2854         $self = bless \do { my $anonymous_scalar }, $class;
2855         my $addr = do { no overloading; pack 'J', $self; };
2856
2857         # Optional parent object, only for debug info.
2858         $owner_name_of{$addr} = delete $args{'Owner'};
2859         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2860
2861         # Stringify, in case it is an object.
2862         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2863
2864         # This is used only for error messages, and so a colon is added
2865         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2866
2867         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2868
2869         # Max is initialized to a negative value that isn't adjacent to 0,
2870         # for simpler tests
2871         $max{$addr} = -2;
2872
2873         $_search_ranges_cache{$addr} = 0;
2874         $ranges{$addr} = [];
2875
2876         return $self;
2877     }
2878
2879     use overload
2880         fallback => 0,
2881         qw("") => "_operator_stringify",
2882         "." => \&main::_operator_dot,
2883     ;
2884
2885     sub _operator_stringify {
2886         my $self = shift;
2887         my $addr = do { no overloading; pack 'J', $self; };
2888
2889         return "Range_List attached to '$owner_name_of{$addr}'"
2890                                                 if $owner_name_of{$addr};
2891         return "anonymous Range_List " . \$self;
2892     }
2893
2894     sub _union {
2895         # Returns the union of the input code points.  It can be called as
2896         # either a constructor or a method.  If called as a method, the result
2897         # will be a new() instance of the calling object, containing the union
2898         # of that object with the other parameter's code points;  if called as
2899         # a constructor, the first parameter gives the class the new object
2900         # should be, and the second parameter gives the code points to go into
2901         # it.
2902         # In either case, there are two parameters looked at by this routine;
2903         # any additional parameters are passed to the new() constructor.
2904         #
2905         # The code points can come in the form of some object that contains
2906         # ranges, and has a conventionally named method to access them; or
2907         # they can be an array of individual code points (as integers); or
2908         # just a single code point.
2909         #
2910         # If they are ranges, this routine doesn't make any effort to preserve
2911         # the range values of one input over the other.  Therefore this base
2912         # class should not allow _union to be called from other than
2913         # initialization code, so as to prevent two tables from being added
2914         # together where the range values matter.  The general form of this
2915         # routine therefore belongs in a derived class, but it was moved here
2916         # to avoid duplication of code.  The failure to overload this in this
2917         # class keeps it safe.
2918         #
2919
2920         my $self;
2921         my @args;   # Arguments to pass to the constructor
2922
2923         my $class = shift;
2924
2925         # If a method call, will start the union with the object itself, and
2926         # the class of the new object will be the same as self.
2927         if (ref $class) {
2928             $self = $class;
2929             $class = ref $self;
2930             push @args, $self;
2931         }
2932
2933         # Add the other required parameter.
2934         push @args, shift;
2935         # Rest of parameters are passed on to the constructor
2936
2937         # Accumulate all records from both lists.
2938         my @records;
2939         for my $arg (@args) {
2940             #local $to_trace = 0 if main::DEBUG;
2941             trace "argument = $arg" if main::DEBUG && $to_trace;
2942             if (! defined $arg) {
2943                 my $message = "";
2944                 if (defined $self) {
2945                     no overloading;
2946                     $message .= $owner_name_of{pack 'J', $self};
2947                 }
2948                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
2949                 return;
2950             }
2951             $arg = [ $arg ] if ! ref $arg;
2952             my $type = ref $arg;
2953             if ($type eq 'ARRAY') {
2954                 foreach my $element (@$arg) {
2955                     push @records, Range->new($element, $element);
2956                 }
2957             }
2958             elsif ($arg->isa('Range')) {
2959                 push @records, $arg;
2960             }
2961             elsif ($arg->can('ranges')) {
2962                 push @records, $arg->ranges;
2963             }
2964             else {
2965                 my $message = "";
2966                 if (defined $self) {
2967                     no overloading;
2968                     $message .= $owner_name_of{pack 'J', $self};
2969                 }
2970                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
2971                 return;
2972             }
2973         }
2974
2975         # Sort with the range containing the lowest ordinal first, but if
2976         # two ranges start at the same code point, sort with the bigger range
2977         # of the two first, because it takes fewer cycles.
2978         @records = sort { ($a->start <=> $b->start)
2979                                       or
2980                                     # if b is shorter than a, b->end will be
2981                                     # less than a->end, and we want to select
2982                                     # a, so want to return -1
2983                                     ($b->end <=> $a->end)
2984                                    } @records;
2985
2986         my $new = $class->new(@_);
2987
2988         # Fold in records so long as they add new information.
2989         for my $set (@records) {
2990             my $start = $set->start;
2991             my $end   = $set->end;
2992             my $value   = $set->value;
2993             if ($start > $new->max) {
2994                 $new->_add_delete('+', $start, $end, $value);
2995             }
2996             elsif ($end > $new->max) {
2997                 $new->_add_delete('+', $new->max +1, $end, $value);
2998             }
2999         }
3000
3001         return $new;
3002     }
3003
3004     sub range_count {        # Return the number of ranges in the range list
3005         my $self = shift;
3006         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3007
3008         no overloading;
3009         return scalar @{$ranges{pack 'J', $self}};
3010     }
3011
3012     sub min {
3013         # Returns the minimum code point currently in the range list, or if
3014         # the range list is empty, 2 beyond the max possible.  This is a
3015         # method because used so rarely, that not worth saving between calls,
3016         # and having to worry about changing it as ranges are added and
3017         # deleted.
3018
3019         my $self = shift;
3020         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3021
3022         my $addr = do { no overloading; pack 'J', $self; };
3023
3024         # If the range list is empty, return a large value that isn't adjacent
3025         # to any that could be in the range list, for simpler tests
3026         return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3027         return $ranges{$addr}->[0]->start;
3028     }
3029
3030     sub contains {
3031         # Boolean: Is argument in the range list?  If so returns $i such that:
3032         #   range[$i]->end < $codepoint <= range[$i+1]->end
3033         # which is one beyond what you want; this is so that the 0th range
3034         # doesn't return false
3035         my $self = shift;
3036         my $codepoint = shift;
3037         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3038
3039         my $i = $self->_search_ranges($codepoint);
3040         return 0 unless defined $i;
3041
3042         # The search returns $i, such that
3043         #   range[$i-1]->end < $codepoint <= range[$i]->end
3044         # So is in the table if and only iff it is at least the start position
3045         # of range $i.
3046         no overloading;
3047         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3048         return $i + 1;
3049     }
3050
3051     sub containing_range {
3052         # Returns the range object that contains the code point, undef if none
3053
3054         my $self = shift;
3055         my $codepoint = shift;
3056         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3057
3058         my $i = $self->contains($codepoint);
3059         return unless $i;
3060
3061         # contains() returns 1 beyond where we should look
3062         no overloading;
3063         return $ranges{pack 'J', $self}->[$i-1];
3064     }
3065
3066     sub value_of {
3067         # Returns the value associated with the code point, undef if none
3068
3069         my $self = shift;
3070         my $codepoint = shift;
3071         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3072
3073         my $range = $self->containing_range($codepoint);
3074         return unless defined $range;
3075
3076         return $range->value;
3077     }
3078
3079     sub type_of {
3080         # Returns the type of the range containing the code point, undef if
3081         # the code point is not in the table
3082
3083         my $self = shift;
3084         my $codepoint = shift;
3085         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3086
3087         my $range = $self->containing_range($codepoint);
3088         return unless defined $range;
3089
3090         return $range->type;
3091     }
3092
3093     sub _search_ranges {
3094         # Find the range in the list which contains a code point, or where it
3095         # should go if were to add it.  That is, it returns $i, such that:
3096         #   range[$i-1]->end < $codepoint <= range[$i]->end
3097         # Returns undef if no such $i is possible (e.g. at end of table), or
3098         # if there is an error.
3099
3100         my $self = shift;
3101         my $code_point = shift;
3102         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3103
3104         my $addr = do { no overloading; pack 'J', $self; };
3105
3106         return if $code_point > $max{$addr};
3107         my $r = $ranges{$addr};                # The current list of ranges
3108         my $range_list_size = scalar @$r;
3109         my $i;
3110
3111         use integer;        # want integer division
3112
3113         # Use the cached result as the starting guess for this one, because,
3114         # an experiment on 5.1 showed that 90% of the time the cache was the
3115         # same as the result on the next call (and 7% it was one less).
3116         $i = $_search_ranges_cache{$addr};
3117         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3118                                             # from an intervening deletion
3119         #local $to_trace = 1 if main::DEBUG;
3120         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3121         return $i if $code_point <= $r->[$i]->end
3122                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3123
3124         # Here the cache doesn't yield the correct $i.  Try adding 1.
3125         if ($i < $range_list_size - 1
3126             && $r->[$i]->end < $code_point &&
3127             $code_point <= $r->[$i+1]->end)
3128         {
3129             $i++;
3130             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3131             $_search_ranges_cache{$addr} = $i;
3132             return $i;
3133         }
3134
3135         # Here, adding 1 also didn't work.  We do a binary search to
3136         # find the correct position, starting with current $i
3137         my $lower = 0;
3138         my $upper = $range_list_size - 1;
3139         while (1) {
3140             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3141
3142             if ($code_point <= $r->[$i]->end) {
3143
3144                 # Here we have met the upper constraint.  We can quit if we
3145                 # also meet the lower one.
3146                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3147
3148                 $upper = $i;        # Still too high.
3149
3150             }
3151             else {
3152
3153                 # Here, $r[$i]->end < $code_point, so look higher up.
3154                 $lower = $i;
3155             }
3156
3157             # Split search domain in half to try again.
3158             my $temp = ($upper + $lower) / 2;
3159
3160             # No point in continuing unless $i changes for next time
3161             # in the loop.
3162             if ($temp == $i) {
3163
3164                 # We can't reach the highest element because of the averaging.
3165                 # So if one below the upper edge, force it there and try one
3166                 # more time.
3167                 if ($i == $range_list_size - 2) {
3168
3169                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3170                     $i = $range_list_size - 1;
3171
3172                     # Change $lower as well so if fails next time through,
3173                     # taking the average will yield the same $i, and we will
3174                     # quit with the error message just below.
3175                     $lower = $i;
3176                     next;
3177                 }
3178                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3179                 return;
3180             }
3181             $i = $temp;
3182         } # End of while loop
3183
3184         if (main::DEBUG && $to_trace) {
3185             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3186             trace "i=  [ $i ]", $r->[$i];
3187             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3188         }
3189
3190         # Here we have found the offset.  Cache it as a starting point for the
3191         # next call.
3192         $_search_ranges_cache{$addr} = $i;
3193         return $i;
3194     }
3195
3196     sub _add_delete {
3197         # Add, replace or delete ranges to or from a list.  The $type
3198         # parameter gives which:
3199         #   '+' => insert or replace a range, returning a list of any changed
3200         #          ranges.
3201         #   '-' => delete a range, returning a list of any deleted ranges.
3202         #
3203         # The next three parameters give respectively the start, end, and
3204         # value associated with the range.  'value' should be null unless the
3205         # operation is '+';
3206         #
3207         # The range list is kept sorted so that the range with the lowest
3208         # starting position is first in the list, and generally, adjacent
3209         # ranges with the same values are merged into a single larger one (see
3210         # exceptions below).
3211         #
3212         # There are more parameters; all are key => value pairs:
3213         #   Type    gives the type of the value.  It is only valid for '+'.
3214         #           All ranges have types; if this parameter is omitted, 0 is
3215         #           assumed.  Ranges with type 0 are assumed to obey the
3216         #           Unicode rules for casing, etc; ranges with other types are
3217         #           not.  Otherwise, the type is arbitrary, for the caller's
3218         #           convenience, and looked at only by this routine to keep
3219         #           adjacent ranges of different types from being merged into
3220         #           a single larger range, and when Replace =>
3221         #           $IF_NOT_EQUIVALENT is specified (see just below).
3222         #   Replace  determines what to do if the range list already contains
3223         #            ranges which coincide with all or portions of the input
3224         #            range.  It is only valid for '+':
3225         #       => $NO            means that the new value is not to replace
3226         #                         any existing ones, but any empty gaps of the
3227         #                         range list coinciding with the input range
3228         #                         will be filled in with the new value.
3229         #       => $UNCONDITIONALLY  means to replace the existing values with
3230         #                         this one unconditionally.  However, if the
3231         #                         new and old values are identical, the
3232         #                         replacement is skipped to save cycles
3233         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3234         #                         with this one if they are not equivalent.
3235         #                         Ranges are equivalent if their types are the
3236         #                         same, and they are the same string; or if
3237         #                         both are type 0 ranges, if their Unicode
3238         #                         standard forms are identical.  In this last
3239         #                         case, the routine chooses the more "modern"
3240         #                         one to use.  This is because some of the
3241         #                         older files are formatted with values that
3242         #                         are, for example, ALL CAPs, whereas the
3243         #                         derived files have a more modern style,
3244         #                         which looks better.  By looking for this
3245         #                         style when the pre-existing and replacement
3246         #                         standard forms are the same, we can move to
3247         #                         the modern style
3248         #       => $MULTIPLE      means that if this range duplicates an
3249         #                         existing one, but has a different value,
3250         #                         don't replace the existing one, but insert
3251         #                         this, one so that the same range can occur
3252         #                         multiple times.  They are stored LIFO, so
3253         #                         that the final one inserted is the first one
3254         #                         returned in an ordered search of the table.
3255         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3256         #
3257         # "same value" means identical for non-type-0 ranges, and it means
3258         # having the same standard forms for type-0 ranges.
3259
3260         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3261
3262         my $self = shift;
3263         my $operation = shift;   # '+' for add/replace; '-' for delete;
3264         my $start = shift;
3265         my $end   = shift;
3266         my $value = shift;
3267
3268         my %args = @_;
3269
3270         $value = "" if not defined $value;        # warning: $value can be "0"
3271
3272         my $replace = delete $args{'Replace'};
3273         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3274
3275         my $type = delete $args{'Type'};
3276         $type = 0 unless defined $type;
3277
3278         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3279
3280         my $addr = do { no overloading; pack 'J', $self; };
3281
3282         if ($operation ne '+' && $operation ne '-') {
3283             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3284             return;
3285         }
3286         unless (defined $start && defined $end) {
3287             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3288             return;
3289         }
3290         unless ($end >= $start) {
3291             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3292             return;
3293         }
3294         #local $to_trace = 1 if main::DEBUG;
3295
3296         if ($operation eq '-') {
3297             if ($replace != $IF_NOT_EQUIVALENT) {
3298                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3299                 $replace = $IF_NOT_EQUIVALENT;
3300             }
3301             if ($type) {
3302                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3303                 $type = 0;
3304             }
3305             if ($value ne "") {
3306                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3307                 $value = "";
3308             }
3309         }
3310
3311         my $r = $ranges{$addr};               # The current list of ranges
3312         my $range_list_size = scalar @$r;     # And its size
3313         my $max = $max{$addr};                # The current high code point in
3314                                               # the list of ranges
3315
3316         # Do a special case requiring fewer machine cycles when the new range
3317         # starts after the current highest point.  The Unicode input data is
3318         # structured so this is common.
3319         if ($start > $max) {
3320
3321             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3322             return if $operation eq '-'; # Deleting a non-existing range is a
3323                                          # no-op
3324
3325             # If the new range doesn't logically extend the current final one
3326             # in the range list, create a new range at the end of the range
3327             # list.  (max cleverly is initialized to a negative number not
3328             # adjacent to 0 if the range list is empty, so even adding a range
3329             # to an empty range list starting at 0 will have this 'if'
3330             # succeed.)
3331             if ($start > $max + 1        # non-adjacent means can't extend.
3332                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3333                 || @{$r}[-1]->type != $type # types differ, can't extend.
3334             ) {
3335                 push @$r, Range->new($start, $end,
3336                                      Value => $value,
3337                                      Type => $type);
3338             }
3339             else {
3340
3341                 # Here, the new range starts just after the current highest in
3342                 # the range list, and they have the same type and value.
3343                 # Extend the current range to incorporate the new one.
3344                 @{$r}[-1]->set_end($end);
3345             }
3346
3347             # This becomes the new maximum.
3348             $max{$addr} = $end;
3349
3350             return;
3351         }
3352         #local $to_trace = 0 if main::DEBUG;
3353
3354         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3355
3356         # Here, the input range isn't after the whole rest of the range list.
3357         # Most likely 'splice' will be needed.  The rest of the routine finds
3358         # the needed splice parameters, and if necessary, does the splice.
3359         # First, find the offset parameter needed by the splice function for
3360         # the input range.  Note that the input range may span multiple
3361         # existing ones, but we'll worry about that later.  For now, just find
3362         # the beginning.  If the input range is to be inserted starting in a
3363         # position not currently in the range list, it must (obviously) come
3364         # just after the range below it, and just before the range above it.
3365         # Slightly less obviously, it will occupy the position currently
3366         # occupied by the range that is to come after it.  More formally, we
3367         # are looking for the position, $i, in the array of ranges, such that:
3368         #
3369         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3370         #
3371         # (The ordered relationships within existing ranges are also shown in
3372         # the equation above).  However, if the start of the input range is
3373         # within an existing range, the splice offset should point to that
3374         # existing range's position in the list; that is $i satisfies a
3375         # somewhat different equation, namely:
3376         #
3377         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3378         #
3379         # More briefly, $start can come before or after r[$i]->start, and at
3380         # this point, we don't know which it will be.  However, these
3381         # two equations share these constraints:
3382         #
3383         #   r[$i-1]->end < $start <= r[$i]->end
3384         #
3385         # And that is good enough to find $i.
3386
3387         my $i = $self->_search_ranges($start);
3388         if (! defined $i) {
3389             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3390             return;
3391         }
3392
3393         # The search function returns $i such that:
3394         #
3395         # r[$i-1]->end < $start <= r[$i]->end
3396         #
3397         # That means that $i points to the first range in the range list
3398         # that could possibly be affected by this operation.  We still don't
3399         # know if the start of the input range is within r[$i], or if it
3400         # points to empty space between r[$i-1] and r[$i].
3401         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3402
3403         # Special case the insertion of data that is not to replace any
3404         # existing data.
3405         if ($replace == $NO) {  # If $NO, has to be operation '+'
3406             #local $to_trace = 1 if main::DEBUG;
3407             trace "Doesn't replace" if main::DEBUG && $to_trace;
3408
3409             # Here, the new range is to take effect only on those code points
3410             # that aren't already in an existing range.  This can be done by
3411             # looking through the existing range list and finding the gaps in
3412             # the ranges that this new range affects, and then calling this
3413             # function recursively on each of those gaps, leaving untouched
3414             # anything already in the list.  Gather up a list of the changed
3415             # gaps first so that changes to the internal state as new ranges
3416             # are added won't be a problem.
3417             my @gap_list;
3418
3419             # First, if the starting point of the input range is outside an
3420             # existing one, there is a gap from there to the beginning of the
3421             # existing range -- add a span to fill the part that this new
3422             # range occupies
3423             if ($start < $r->[$i]->start) {
3424                 push @gap_list, Range->new($start,
3425                                            main::min($end,
3426                                                      $r->[$i]->start - 1),
3427                                            Type => $type);
3428                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3429             }
3430
3431             # Then look through the range list for other gaps until we reach
3432             # the highest range affected by the input one.
3433             my $j;
3434             for ($j = $i+1; $j < $range_list_size; $j++) {
3435                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3436                 last if $end < $r->[$j]->start;
3437
3438                 # If there is a gap between when this range starts and the
3439                 # previous one ends, add a span to fill it.  Note that just
3440                 # because there are two ranges doesn't mean there is a
3441                 # non-zero gap between them.  It could be that they have
3442                 # different values or types
3443                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3444                     push @gap_list,
3445                         Range->new($r->[$j-1]->end + 1,
3446                                    $r->[$j]->start - 1,
3447                                    Type => $type);
3448                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3449                 }
3450             }
3451
3452             # Here, we have either found an existing range in the range list,
3453             # beyond the area affected by the input one, or we fell off the
3454             # end of the loop because the input range affects the whole rest
3455             # of the range list.  In either case, $j is 1 higher than the
3456             # highest affected range.  If $j == $i, it means that there are no
3457             # affected ranges, that the entire insertion is in the gap between
3458             # r[$i-1], and r[$i], which we already have taken care of before
3459             # the loop.
3460             # On the other hand, if there are affected ranges, it might be
3461             # that there is a gap that needs filling after the final such
3462             # range to the end of the input range
3463             if ($r->[$j-1]->end < $end) {
3464                     push @gap_list, Range->new(main::max($start,
3465                                                          $r->[$j-1]->end + 1),
3466                                                $end,
3467                                                Type => $type);
3468                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3469             }
3470
3471             # Call recursively to fill in all the gaps.
3472             foreach my $gap (@gap_list) {
3473                 $self->_add_delete($operation,
3474                                    $gap->start,
3475                                    $gap->end,
3476                                    $value,
3477                                    Type => $type);
3478             }
3479
3480             return;
3481         }
3482
3483         # Here, we have taken care of the case where $replace is $NO.
3484         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3485         # If inserting a multiple record, this is where it goes, before the
3486         # first (if any) existing one.  This implies an insertion, and no
3487         # change to any existing ranges.  Note that $i can be -1 if this new
3488         # range doesn't actually duplicate any existing, and comes at the
3489         # beginning of the list.
3490         if ($replace == $MULTIPLE) {
3491
3492             if ($start != $end) {
3493                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3494                 return;
3495             }
3496
3497             # Don't add an exact duplicate, as it isn't really a multiple
3498             if ($end >= $r->[$i]->start) {
3499                 my $existing_value = $r->[$i]->value;
3500                 my $existing_type = $r->[$i]->type;
3501                 return if $value eq $existing_value && $type eq $existing_type;
3502
3503                 # If the multiple value is part of an existing range, we want
3504                 # to split up that range, so that only the single code point
3505                 # is affected.  To do this, we first call ourselves
3506                 # recursively to delete that code point from the table, having
3507                 # preserved its current data above.  Then we call ourselves
3508                 # recursively again to add the new multiple, which we know by
3509                 # the test just above is different than the current code
3510                 # point's value, so it will become a range containing a single
3511                 # code point: just itself.  Finally, we add back in the
3512                 # pre-existing code point, which will again be a single code
3513                 # point range.  Because 'i' likely will have changed as a
3514                 # result of these operations, we can't just continue on, but
3515                 # do this operation recursively as well.
3516                 if ($r->[$i]->start != $r->[$i]->end) {
3517                     $self->_add_delete('-', $start, $end, "");
3518                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3519                     return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
3520                 }
3521             }
3522
3523             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3524             my @return = splice @$r,
3525                                 $i,
3526                                 0,
3527                                 Range->new($start,
3528                                            $end,
3529                                            Value => $value,
3530                                            Type => $type);
3531             if (main::DEBUG && $to_trace) {
3532                 trace "After splice:";
3533                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3534                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3535                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
3536                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3537                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3538                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3539             }
3540             return @return;
3541         }
3542
3543         # Here, we have taken care of $NO and $MULTIPLE replaces.  This leaves
3544         # delete, insert, and replace either unconditionally or if not
3545         # equivalent.  $i still points to the first potential affected range.
3546         # Now find the highest range affected, which will determine the length
3547         # parameter to splice.  (The input range can span multiple existing
3548         # ones.)  If this isn't a deletion, while we are looking through the
3549         # range list, see also if this is a replacement rather than a clean
3550         # insertion; that is if it will change the values of at least one
3551         # existing range.  Start off assuming it is an insert, until find it
3552         # isn't.
3553         my $clean_insert = $operation eq '+';
3554         my $j;        # This will point to the highest affected range
3555
3556         # For non-zero types, the standard form is the value itself;
3557         my $standard_form = ($type) ? $value : main::standardize($value);
3558
3559         for ($j = $i; $j < $range_list_size; $j++) {
3560             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3561
3562             # If find a range that it doesn't overlap into, we can stop
3563             # searching
3564             last if $end < $r->[$j]->start;
3565
3566             # Here, overlaps the range at $j.  If the values don't match,
3567             # and so far we think this is a clean insertion, it becomes a
3568             # non-clean insertion, i.e., a 'change' or 'replace' instead.
3569             if ($clean_insert) {
3570                 if ($r->[$j]->standard_form ne $standard_form) {
3571                     $clean_insert = 0;
3572                     if ($replace == $CROAK) {
3573                         main::croak("The range to add "
3574                         . sprintf("%04X", $start)
3575                         . '-'
3576                         . sprintf("%04X", $end)
3577                         . " with value '$value' overlaps an existing range $r->[$j]");
3578                     }
3579                 }
3580                 else {
3581
3582                     # Here, the two values are essentially the same.  If the
3583                     # two are actually identical, replacing wouldn't change
3584                     # anything so skip it.
3585                     my $pre_existing = $r->[$j]->value;
3586                     if ($pre_existing ne $value) {
3587
3588                         # Here the new and old standardized values are the
3589                         # same, but the non-standardized values aren't.  If
3590                         # replacing unconditionally, then replace
3591                         if( $replace == $UNCONDITIONALLY) {
3592                             $clean_insert = 0;
3593                         }
3594                         else {
3595
3596                             # Here, are replacing conditionally.  Decide to
3597                             # replace or not based on which appears to look
3598                             # the "nicest".  If one is mixed case and the
3599                             # other isn't, choose the mixed case one.
3600                             my $new_mixed = $value =~ /[A-Z]/
3601                                             && $value =~ /[a-z]/;
3602                             my $old_mixed = $pre_existing =~ /[A-Z]/
3603                                             && $pre_existing =~ /[a-z]/;
3604
3605                             if ($old_mixed != $new_mixed) {
3606                                 $clean_insert = 0 if $new_mixed;
3607                                 if (main::DEBUG && $to_trace) {
3608                                     if ($clean_insert) {
3609                                         trace "Retaining $pre_existing over $value";
3610                                     }
3611                                     else {
3612                                         trace "Replacing $pre_existing with $value";
3613                                     }
3614                                 }
3615                             }
3616                             else {
3617
3618                                 # Here casing wasn't different between the two.
3619                                 # If one has hyphens or underscores and the
3620                                 # other doesn't, choose the one with the
3621                                 # punctuation.
3622                                 my $new_punct = $value =~ /[-_]/;
3623                                 my $old_punct = $pre_existing =~ /[-_]/;
3624
3625                                 if ($old_punct != $new_punct) {
3626                                     $clean_insert = 0 if $new_punct;
3627                                     if (main::DEBUG && $to_trace) {
3628                                         if ($clean_insert) {
3629                                             trace "Retaining $pre_existing over $value";
3630                                         }
3631                                         else {
3632                                             trace "Replacing $pre_existing with $value";
3633                                         }
3634                                     }
3635                                 }   # else existing one is just as "good";
3636                                     # retain it to save cycles.
3637                             }
3638                         }
3639                     }
3640                 }
3641             }
3642         } # End of loop looking for highest affected range.
3643
3644         # Here, $j points to one beyond the highest range that this insertion
3645         # affects (hence to beyond the range list if that range is the final
3646         # one in the range list).
3647
3648         # The splice length is all the affected ranges.  Get it before
3649         # subtracting, for efficiency, so we don't have to later add 1.
3650         my $length = $j - $i;
3651
3652         $j--;        # $j now points to the highest affected range.
3653         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3654
3655         # Here, have taken care of $NO and $MULTIPLE replaces.
3656         # $j points to the highest affected range.  But it can be < $i or even
3657         # -1.  These happen only if the insertion is entirely in the gap
3658         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3659         # above exited first time through with $end < $r->[$i]->start.  (And
3660         # then we subtracted one from j)  This implies also that $start <
3661         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3662         # $start, so the entire input range is in the gap.
3663         if ($j < $i) {
3664
3665             # Here the entire input range is in the gap before $i.
3666
3667             if (main::DEBUG && $to_trace) {
3668                 if ($i) {
3669                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3670                 }
3671                 else {
3672                     trace "Entire range is before $r->[$i]";
3673                 }
3674             }
3675             return if $operation ne '+'; # Deletion of a non-existent range is
3676                                          # a no-op
3677         }
3678         else {
3679
3680             # Here part of the input range is not in the gap before $i.  Thus,
3681             # there is at least one affected one, and $j points to the highest
3682             # such one.
3683
3684             # At this point, here is the situation:
3685             # This is not an insertion of a multiple, nor of tentative ($NO)
3686             # data.
3687             #   $i  points to the first element in the current range list that
3688             #            may be affected by this operation.  In fact, we know
3689             #            that the range at $i is affected because we are in
3690             #            the else branch of this 'if'
3691             #   $j  points to the highest affected range.
3692             # In other words,
3693             #   r[$i-1]->end < $start <= r[$i]->end
3694             # And:
3695             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3696             #
3697             # Also:
3698             #   $clean_insert is a boolean which is set true if and only if
3699             #        this is a "clean insertion", i.e., not a change nor a
3700             #        deletion (multiple was handled above).
3701
3702             # We now have enough information to decide if this call is a no-op
3703             # or not.  It is a no-op if this is an insertion of already
3704             # existing data.
3705
3706             if (main::DEBUG && $to_trace && $clean_insert
3707                                          && $i == $j
3708                                          && $start >= $r->[$i]->start)
3709             {
3710                     trace "no-op";
3711             }
3712             return if $clean_insert
3713                       && $i == $j # more than one affected range => not no-op
3714
3715                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3716                       # Further, $start and/or $end is >= r[$i]->start
3717                       # The test below hence guarantees that
3718                       #     r[$i]->start < $start <= $end <= r[$i]->end
3719                       # This means the input range is contained entirely in
3720                       # the one at $i, so is a no-op
3721                       && $start >= $r->[$i]->start;
3722         }
3723
3724         # Here, we know that some action will have to be taken.  We have
3725         # calculated the offset and length (though adjustments may be needed)
3726         # for the splice.  Now start constructing the replacement list.
3727         my @replacement;
3728         my $splice_start = $i;
3729
3730         my $extends_below;
3731         my $extends_above;
3732
3733         # See if should extend any adjacent ranges.
3734         if ($operation eq '-') { # Don't extend deletions
3735             $extends_below = $extends_above = 0;
3736         }
3737         else {  # Here, should extend any adjacent ranges.  See if there are
3738                 # any.
3739             $extends_below = ($i > 0
3740                             # can't extend unless adjacent
3741                             && $r->[$i-1]->end == $start -1
3742                             # can't extend unless are same standard value
3743                             && $r->[$i-1]->standard_form eq $standard_form
3744                             # can't extend unless share type
3745                             && $r->[$i-1]->type == $type);
3746             $extends_above = ($j+1 < $range_list_size
3747                             && $r->[$j+1]->start == $end +1
3748                             && $r->[$j+1]->standard_form eq $standard_form
3749                             && $r->[$j+1]->type == $type);
3750         }
3751         if ($extends_below && $extends_above) { # Adds to both
3752             $splice_start--;     # start replace at element below
3753             $length += 2;        # will replace on both sides
3754             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3755
3756             # The result will fill in any gap, replacing both sides, and
3757             # create one large range.
3758             @replacement = Range->new($r->[$i-1]->start,
3759                                       $r->[$j+1]->end,
3760                                       Value => $value,
3761                                       Type => $type);
3762         }
3763         else {
3764
3765             # Here we know that the result won't just be the conglomeration of
3766             # a new range with both its adjacent neighbors.  But it could
3767             # extend one of them.
3768
3769             if ($extends_below) {
3770
3771                 # Here the new element adds to the one below, but not to the
3772                 # one above.  If inserting, and only to that one range,  can
3773                 # just change its ending to include the new one.
3774                 if ($length == 0 && $clean_insert) {
3775                     $r->[$i-1]->set_end($end);
3776                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3777                     return;
3778                 }
3779                 else {
3780                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3781                     $splice_start--;        # start replace at element below
3782                     $length++;              # will replace the element below
3783                     $start = $r->[$i-1]->start;
3784                 }
3785             }
3786             elsif ($extends_above) {
3787
3788                 # Here the new element adds to the one above, but not below.
3789                 # Mirror the code above
3790                 if ($length == 0 && $clean_insert) {
3791                     $r->[$j+1]->set_start($start);
3792                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3793                     return;
3794                 }
3795                 else {
3796                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3797                     $length++;        # will replace the element above
3798                     $end = $r->[$j+1]->end;
3799                 }
3800             }
3801
3802             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3803
3804             # Finally, here we know there will have to be a splice.
3805             # If the change or delete affects only the highest portion of the
3806             # first affected range, the range will have to be split.  The
3807             # splice will remove the whole range, but will replace it by a new
3808             # range containing just the unaffected part.  So, in this case,
3809             # add to the replacement list just this unaffected portion.
3810             if (! $extends_below
3811                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3812             {
3813                 push @replacement,
3814                     Range->new($r->[$i]->start,
3815                                $start - 1,
3816                                Value => $r->[$i]->value,
3817                                Type => $r->[$i]->type);
3818             }
3819
3820             # In the case of an insert or change, but not a delete, we have to
3821             # put in the new stuff;  this comes next.
3822             if ($operation eq '+') {
3823                 push @replacement, Range->new($start,
3824                                               $end,
3825                                               Value => $value,
3826                                               Type => $type);
3827             }
3828
3829             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3830             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3831
3832             # And finally, if we're changing or deleting only a portion of the
3833             # highest affected range, it must be split, as the lowest one was.
3834             if (! $extends_above
3835                 && $j >= 0  # Remember that j can be -1 if before first
3836                             # current element
3837                 && $end >= $r->[$j]->start
3838                 && $end < $r->[$j]->end)
3839             {
3840                 push @replacement,
3841                     Range->new($end + 1,
3842                                $r->[$j]->end,
3843                                Value => $r->[$j]->value,
3844                                Type => $r->[$j]->type);
3845             }
3846         }
3847
3848         # And do the splice, as calculated above
3849         if (main::DEBUG && $to_trace) {
3850             trace "replacing $length element(s) at $i with ";
3851             foreach my $replacement (@replacement) {
3852                 trace "    $replacement";
3853             }
3854             trace "Before splice:";
3855             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3856             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3857             trace "i  =[", $i, "]", $r->[$i];
3858             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3859             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3860         }
3861
3862         my @return = splice @$r, $splice_start, $length, @replacement;
3863
3864         if (main::DEBUG && $to_trace) {
3865             trace "After splice:";
3866             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3867             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3868             trace "i  =[", $i, "]", $r->[$i];
3869             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3870             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3871             trace "removed ", @return if @return;
3872         }
3873
3874         # An actual deletion could have changed the maximum in the list.
3875         # There was no deletion if the splice didn't return something, but
3876         # otherwise recalculate it.  This is done too rarely to worry about
3877         # performance.
3878         if ($operation eq '-' && @return) {
3879             $max{$addr} = $r->[-1]->end;
3880         }
3881         return @return;
3882     }
3883
3884     sub reset_each_range {  # reset the iterator for each_range();
3885         my $self = shift;
3886         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3887
3888         no overloading;
3889         undef $each_range_iterator{pack 'J', $self};
3890         return;
3891     }
3892
3893     sub each_range {
3894         # Iterate over each range in a range list.  Results are undefined if
3895         # the range list is changed during the iteration.
3896
3897         my $self = shift;
3898         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3899
3900         my $addr = do { no overloading; pack 'J', $self; };
3901
3902         return if $self->is_empty;
3903
3904         $each_range_iterator{$addr} = -1
3905                                 if ! defined $each_range_iterator{$addr};
3906         $each_range_iterator{$addr}++;
3907         return $ranges{$addr}->[$each_range_iterator{$addr}]
3908                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
3909         undef $each_range_iterator{$addr};
3910         return;
3911     }
3912
3913     sub count {        # Returns count of code points in range list
3914         my $self = shift;
3915         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3916
3917         my $addr = do { no overloading; pack 'J', $self; };
3918
3919         my $count = 0;
3920         foreach my $range (@{$ranges{$addr}}) {
3921             $count += $range->end - $range->start + 1;
3922         }
3923         return $count;
3924     }
3925
3926     sub delete_range {    # Delete a range
3927         my $self = shift;
3928         my $start = shift;
3929         my $end = shift;
3930
3931         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3932
3933         return $self->_add_delete('-', $start, $end, "");
3934     }
3935
3936     sub is_empty { # Returns boolean as to if a range list is empty
3937         my $self = shift;
3938         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3939
3940         no overloading;
3941         return scalar @{$ranges{pack 'J', $self}} == 0;
3942     }
3943
3944     sub hash {
3945         # Quickly returns a scalar suitable for separating tables into
3946         # buckets, i.e. it is a hash function of the contents of a table, so
3947         # there are relatively few conflicts.
3948
3949         my $self = shift;
3950         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3951
3952         my $addr = do { no overloading; pack 'J', $self; };
3953
3954         # These are quickly computable.  Return looks like 'min..max;count'
3955         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3956     }
3957 } # End closure for _Range_List_Base
3958
3959 package Range_List;
3960 use base '_Range_List_Base';
3961
3962 # A Range_List is a range list for match tables; i.e. the range values are
3963 # not significant.  Thus a number of operations can be safely added to it,
3964 # such as inversion, intersection.  Note that union is also an unsafe
3965 # operation when range values are cared about, and that method is in the base
3966 # class, not here.  But things are set up so that that method is callable only
3967 # during initialization.  Only in this derived class, is there an operation
3968 # that combines two tables.  A Range_Map can thus be used to initialize a
3969 # Range_List, and its mappings will be in the list, but are not significant to
3970 # this class.
3971
3972 sub trace { return main::trace(@_); }
3973
3974 { # Closure
3975
3976     use overload
3977         fallback => 0,
3978         '+' => sub { my $self = shift;
3979                     my $other = shift;
3980
3981                     return $self->_union($other)
3982                 },
3983         '&' => sub { my $self = shift;
3984                     my $other = shift;
3985
3986                     return $self->_intersect($other, 0);
3987                 },
3988         '~' => "_invert",
3989         '-' => "_subtract",
3990     ;
3991
3992     sub _invert {
3993         # Returns a new Range_List that gives all code points not in $self.
3994
3995         my $self = shift;
3996
3997         my $new = Range_List->new;
3998
3999         # Go through each range in the table, finding the gaps between them
4000         my $max = -1;   # Set so no gap before range beginning at 0
4001         for my $range ($self->ranges) {
4002             my $start = $range->start;
4003             my $end   = $range->end;
4004
4005             # If there is a gap before this range, the inverse will contain
4006             # that gap.
4007             if ($start > $max + 1) {
4008                 $new->add_range($max + 1, $start - 1);
4009             }
4010             $max = $end;
4011         }
4012
4013         # And finally, add the gap from the end of the table to the max
4014         # possible code point
4015         if ($max < $LAST_UNICODE_CODEPOINT) {
4016             $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
4017         }
4018         return $new;
4019     }
4020
4021     sub _subtract {
4022         # Returns a new Range_List with the argument deleted from it.  The
4023         # argument can be a single code point, a range, or something that has
4024         # a range, with the _range_list() method on it returning them
4025
4026         my $self = shift;
4027         my $other = shift;
4028         my $reversed = shift;
4029         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4030
4031         if ($reversed) {
4032             Carp::my_carp_bug("Can't cope with a "
4033              .  __PACKAGE__
4034              . " being the second parameter in a '-'.  Subtraction ignored.");
4035             return $self;
4036         }
4037
4038         my $new = Range_List->new(Initialize => $self);
4039
4040         if (! ref $other) { # Single code point
4041             $new->delete_range($other, $other);
4042         }
4043         elsif ($other->isa('Range')) {
4044             $new->delete_range($other->start, $other->end);
4045         }
4046         elsif ($other->can('_range_list')) {
4047             foreach my $range ($other->_range_list->ranges) {
4048                 $new->delete_range($range->start, $range->end);
4049             }
4050         }
4051         else {
4052             Carp::my_carp_bug("Can't cope with a "
4053                         . ref($other)
4054                         . " argument to '-'.  Subtraction ignored."
4055                         );
4056             return $self;
4057         }
4058
4059         return $new;
4060     }
4061
4062     sub _intersect {
4063         # Returns either a boolean giving whether the two inputs' range lists
4064         # intersect (overlap), or a new Range_List containing the intersection
4065         # of the two lists.  The optional final parameter being true indicates
4066         # to do the check instead of the intersection.
4067
4068         my $a_object = shift;
4069         my $b_object = shift;
4070         my $check_if_overlapping = shift;
4071         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4072         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4073
4074         if (! defined $b_object) {
4075             my $message = "";
4076             $message .= $a_object->_owner_name_of if defined $a_object;
4077             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4078             return;
4079         }
4080
4081         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4082         # Thus the intersection could be much more simply be written:
4083         #   return ~(~$a_object + ~$b_object);
4084         # But, this is slower, and when taking the inverse of a large
4085         # range_size_1 table, back when such tables were always stored that
4086         # way, it became prohibitively slow, hence the code was changed to the
4087         # below
4088
4089         if ($b_object->isa('Range')) {
4090             $b_object = Range_List->new(Initialize => $b_object,
4091                                         Owner => $a_object->_owner_name_of);
4092         }
4093         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4094
4095         my @a_ranges = $a_object->ranges;
4096         my @b_ranges = $b_object->ranges;
4097
4098         #local $to_trace = 1 if main::DEBUG;
4099         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4100
4101         # Start with the first range in each list
4102         my $a_i = 0;
4103         my $range_a = $a_ranges[$a_i];
4104         my $b_i = 0;
4105         my $range_b = $b_ranges[$b_i];
4106
4107         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4108                                                 if ! $check_if_overlapping;
4109
4110         # If either list is empty, there is no intersection and no overlap
4111         if (! defined $range_a || ! defined $range_b) {
4112             return $check_if_overlapping ? 0 : $new;
4113         }
4114         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4115
4116         # Otherwise, must calculate the intersection/overlap.  Start with the
4117         # very first code point in each list
4118         my $a = $range_a->start;
4119         my $b = $range_b->start;
4120
4121         # Loop through all the ranges of each list; in each iteration, $a and
4122         # $b are the current code points in their respective lists
4123         while (1) {
4124
4125             # If $a and $b are the same code point, ...
4126             if ($a == $b) {
4127
4128                 # it means the lists overlap.  If just checking for overlap
4129                 # know the answer now,
4130                 return 1 if $check_if_overlapping;
4131
4132                 # The intersection includes this code point plus anything else
4133                 # common to both current ranges.
4134                 my $start = $a;
4135                 my $end = main::min($range_a->end, $range_b->end);
4136                 if (! $check_if_overlapping) {
4137                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4138                     $new->add_range($start, $end);
4139                 }
4140
4141                 # Skip ahead to the end of the current intersect
4142                 $a = $b = $end;
4143
4144                 # If the current intersect ends at the end of either range (as
4145                 # it must for at least one of them), the next possible one
4146                 # will be the beginning code point in it's list's next range.
4147                 if ($a == $range_a->end) {
4148                     $range_a = $a_ranges[++$a_i];
4149                     last unless defined $range_a;
4150                     $a = $range_a->start;
4151                 }
4152                 if ($b == $range_b->end) {
4153                     $range_b = $b_ranges[++$b_i];
4154                     last unless defined $range_b;
4155                     $b = $range_b->start;
4156                 }
4157
4158                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4159             }
4160             elsif ($a < $b) {
4161
4162                 # Not equal, but if the range containing $a encompasses $b,
4163                 # change $a to be the middle of the range where it does equal
4164                 # $b, so the next iteration will get the intersection
4165                 if ($range_a->end >= $b) {
4166                     $a = $b;
4167                 }
4168                 else {
4169
4170                     # Here, the current range containing $a is entirely below
4171                     # $b.  Go try to find a range that could contain $b.
4172                     $a_i = $a_object->_search_ranges($b);
4173
4174                     # If no range found, quit.
4175                     last unless defined $a_i;
4176
4177                     # The search returns $a_i, such that
4178                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4179                     # Set $a to the beginning of this new range, and repeat.
4180                     $range_a = $a_ranges[$a_i];
4181                     $a = $range_a->start;
4182                 }
4183             }
4184             else { # Here, $b < $a.
4185
4186                 # Mirror image code to the leg just above
4187                 if ($range_b->end >= $a) {
4188                     $b = $a;
4189                 }
4190                 else {
4191                     $b_i = $b_object->_search_ranges($a);
4192                     last unless defined $b_i;
4193                     $range_b = $b_ranges[$b_i];
4194                     $b = $range_b->start;
4195                 }
4196             }
4197         } # End of looping through ranges.
4198
4199         # Intersection fully computed, or now know that there is no overlap
4200         return $check_if_overlapping ? 0 : $new;
4201     }
4202
4203     sub overlaps {
4204         # Returns boolean giving whether the two arguments overlap somewhere
4205
4206         my $self = shift;
4207         my $other = shift;
4208         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4209
4210         return $self->_intersect($other, 1);
4211     }
4212
4213     sub add_range {
4214         # Add a range to the list.
4215
4216         my $self = shift;
4217         my $start = shift;
4218         my $end = shift;
4219         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4220
4221         return $self->_add_delete('+', $start, $end, "");
4222     }
4223
4224     sub matches_identically_to {
4225         # Return a boolean as to whether or not two Range_Lists match identical
4226         # sets of code points.
4227
4228         my $self = shift;
4229         my $other = shift;
4230         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4231
4232         # These are ordered in increasing real time to figure out (at least
4233         # until a patch changes that and doesn't change this)
4234         return 0 if $self->max != $other->max;
4235         return 0 if $self->min != $other->min;
4236         return 0 if $self->range_count != $other->range_count;
4237         return 0 if $self->count != $other->count;
4238
4239         # Here they could be identical because all the tests above passed.
4240         # The loop below is somewhat simpler since we know they have the same
4241         # number of elements.  Compare range by range, until reach the end or
4242         # find something that differs.
4243         my @a_ranges = $self->ranges;
4244         my @b_ranges = $other->ranges;
4245         for my $i (0 .. @a_ranges - 1) {
4246             my $a = $a_ranges[$i];
4247             my $b = $b_ranges[$i];
4248             trace "self $a; other $b" if main::DEBUG && $to_trace;
4249             return 0 if ! defined $b
4250                         || $a->start != $b->start
4251                         || $a->end != $b->end;
4252         }
4253         return 1;
4254     }
4255
4256     sub is_code_point_usable {
4257         # This used only for making the test script.  See if the input
4258         # proposed trial code point is one that Perl will handle.  If second
4259         # parameter is 0, it won't select some code points for various
4260         # reasons, noted below.
4261
4262         my $code = shift;
4263         my $try_hard = shift;
4264         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4265
4266         return 0 if $code < 0;                # Never use a negative
4267
4268         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4269         # the character very frequently used.
4270         return $try_hard if $code == 0x0000;
4271
4272         # shun non-character code points.
4273         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4274         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4275
4276         return $try_hard if $code > $LAST_UNICODE_CODEPOINT;   # keep in range
4277         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4278
4279         return 1;
4280     }
4281
4282     sub get_valid_code_point {
4283         # Return a code point that's part of the range list.  Returns nothing
4284         # if the table is empty or we can't find a suitable code point.  This
4285         # used only for making the test script.
4286
4287         my $self = shift;
4288         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4289
4290         my $addr = do { no overloading; pack 'J', $self; };
4291
4292         # On first pass, don't choose less desirable code points; if no good
4293         # one is found, repeat, allowing a less desirable one to be selected.
4294         for my $try_hard (0, 1) {
4295
4296             # Look through all the ranges for a usable code point.
4297             for my $set ($self->ranges) {
4298
4299                 # Try the edge cases first, starting with the end point of the
4300                 # range.
4301                 my $end = $set->end;
4302                 return $end if is_code_point_usable($end, $try_hard);
4303
4304                 # End point didn't, work.  Start at the beginning and try
4305                 # every one until find one that does work.
4306                 for my $trial ($set->start .. $end - 1) {
4307                     return $trial if is_code_point_usable($trial, $try_hard);
4308                 }
4309             }
4310         }
4311         return ();  # If none found, give up.
4312     }
4313
4314     sub get_invalid_code_point {
4315         # Return a code point that's not part of the table.  Returns nothing
4316         # if the table covers all code points or a suitable code point can't
4317         # be found.  This used only for making the test script.
4318
4319         my $self = shift;
4320         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4321
4322         # Just find a valid code point of the inverse, if any.
4323         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4324     }
4325 } # end closure for Range_List
4326
4327 package Range_Map;
4328 use base '_Range_List_Base';
4329
4330 # A Range_Map is a range list in which the range values (called maps) are
4331 # significant, and hence shouldn't be manipulated by our other code, which
4332 # could be ambiguous or lose things.  For example, in taking the union of two
4333 # lists, which share code points, but which have differing values, which one
4334 # has precedence in the union?
4335 # It turns out that these operations aren't really necessary for map tables,
4336 # and so this class was created to make sure they aren't accidentally
4337 # applied to them.
4338
4339 { # Closure
4340
4341     sub add_map {
4342         # Add a range containing a mapping value to the list
4343
4344         my $self = shift;
4345         # Rest of parameters passed on
4346
4347         return $self->_add_delete('+', @_);
4348     }
4349
4350     sub add_duplicate {
4351         # Adds entry to a range list which can duplicate an existing entry
4352
4353         my $self = shift;
4354         my $code_point = shift;
4355         my $value = shift;
4356         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4357
4358         return $self->add_map($code_point, $code_point,
4359                                 $value, Replace => $MULTIPLE);
4360     }
4361 } # End of closure for package Range_Map
4362
4363 package _Base_Table;
4364
4365 # A table is the basic data structure that gets written out into a file for
4366 # use by the Perl core.  This is the abstract base class implementing the
4367 # common elements from the derived ones.  A list of the methods to be
4368 # furnished by an implementing class is just after the constructor.
4369
4370 sub standardize { return main::standardize($_[0]); }
4371 sub trace { return main::trace(@_); }
4372
4373 { # Closure
4374
4375     main::setup_package();
4376
4377     my %range_list;
4378     # Object containing the ranges of the table.
4379     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4380
4381     my %full_name;
4382     # The full table name.
4383     main::set_access('full_name', \%full_name, 'r');
4384
4385     my %name;
4386     # The table name, almost always shorter
4387     main::set_access('name', \%name, 'r');
4388
4389     my %short_name;
4390     # The shortest of all the aliases for this table, with underscores removed
4391     main::set_access('short_name', \%short_name);
4392
4393     my %nominal_short_name_length;
4394     # The length of short_name before removing underscores
4395     main::set_access('nominal_short_name_length',
4396                     \%nominal_short_name_length);
4397
4398     my %complete_name;
4399     # The complete name, including property.
4400     main::set_access('complete_name', \%complete_name, 'r');
4401
4402     my %property;
4403     # Parent property this table is attached to.
4404     main::set_access('property', \%property, 'r');
4405
4406     my %aliases;
4407     # Ordered list of alias objects of the table's name.  The first ones in
4408     # the list are output first in comments
4409     main::set_access('aliases', \%aliases, 'readable_array');
4410
4411     my %comment;
4412     # A comment associated with the table for human readers of the files
4413     main::set_access('comment', \%comment, 's');
4414
4415     my %description;
4416     # A comment giving a short description of the table's meaning for human
4417     # readers of the files.
4418     main::set_access('description', \%description, 'readable_array');
4419
4420     my %note;
4421     # A comment giving a short note about the table for human readers of the
4422     # files.
4423     main::set_access('note', \%note, 'readable_array');
4424
4425     my %internal_only;
4426     # Boolean; if set this table is for internal core Perl only use.
4427     main::set_access('internal_only', \%internal_only);
4428
4429     my %find_table_from_alias;
4430     # The parent property passes this pointer to a hash which this class adds
4431     # all its aliases to, so that the parent can quickly take an alias and
4432     # find this table.
4433     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4434
4435     my %locked;
4436     # After this table is made equivalent to another one; we shouldn't go
4437     # changing the contents because that could mean it's no longer equivalent
4438     main::set_access('locked', \%locked, 'r');
4439
4440     my %file_path;
4441     # This gives the final path to the file containing the table.  Each
4442     # directory in the path is an element in the array
4443     main::set_access('file_path', \%file_path, 'readable_array');
4444
4445     my %status;
4446     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4447     main::set_access('status', \%status, 'r');
4448
4449     my %status_info;
4450     # A comment about its being obsolete, or whatever non normal status it has
4451     main::set_access('status_info', \%status_info, 'r');
4452
4453     my %caseless_equivalent;
4454     # The table this is equivalent to under /i matching, if any.
4455     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4456
4457     my %range_size_1;
4458     # Is the table to be output with each range only a single code point?
4459     # This is done to avoid breaking existing code that may have come to rely
4460     # on this behavior in previous versions of this program.)
4461     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4462
4463     my %perl_extension;
4464     # A boolean set iff this table is a Perl extension to the Unicode
4465     # standard.
4466     main::set_access('perl_extension', \%perl_extension, 'r');
4467
4468     my %output_range_counts;
4469     # A boolean set iff this table is to have comments written in the
4470     # output file that contain the number of code points in the range.
4471     # The constructor can override the global flag of the same name.
4472     main::set_access('output_range_counts', \%output_range_counts, 'r');
4473
4474     my %format;
4475     # The format of the entries of the table.  This is calculated from the
4476     # data in the table (or passed in the constructor).  This is an enum e.g.,
4477     # $STRING_FORMAT
4478     main::set_access('format', \%format, 'r', 'p_s');
4479
4480     sub new {
4481         # All arguments are key => value pairs, which you can see below, most
4482         # of which match fields documented above.  Otherwise: Pod_Entry,
4483         # Externally_Ok, and Fuzzy apply to the names of the table, and are
4484         # documented in the Alias package
4485
4486         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4487
4488         my $class = shift;
4489
4490         my $self = bless \do { my $anonymous_scalar }, $class;
4491         my $addr = do { no overloading; pack 'J', $self; };
4492
4493         my %args = @_;
4494
4495         $name{$addr} = delete $args{'Name'};
4496         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4497         $full_name{$addr} = delete $args{'Full_Name'};
4498         my $complete_name = $complete_name{$addr}
4499                           = delete $args{'Complete_Name'};
4500         $format{$addr} = delete $args{'Format'};
4501         $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4502         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4503         $property{$addr} = delete $args{'_Property'};
4504         $range_list{$addr} = delete $args{'_Range_List'};
4505         $status{$addr} = delete $args{'Status'} || $NORMAL;
4506         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4507         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4508         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4509
4510         my $description = delete $args{'Description'};
4511         my $externally_ok = delete $args{'Externally_Ok'};
4512         my $loose_match = delete $args{'Fuzzy'};
4513         my $note = delete $args{'Note'};
4514         my $make_pod_entry = delete $args{'Pod_Entry'};
4515         my $perl_extension = delete $args{'Perl_Extension'};
4516
4517         # Shouldn't have any left over
4518         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4519
4520         # Can't use || above because conceivably the name could be 0, and
4521         # can't use // operator in case this program gets used in Perl 5.8
4522         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4523         $output_range_counts{$addr} = $output_range_counts if
4524                                         ! defined $output_range_counts{$addr};
4525
4526         $aliases{$addr} = [ ];
4527         $comment{$addr} = [ ];
4528         $description{$addr} = [ ];
4529         $note{$addr} = [ ];
4530         $file_path{$addr} = [ ];
4531         $locked{$addr} = "";
4532
4533         push @{$description{$addr}}, $description if $description;
4534         push @{$note{$addr}}, $note if $note;
4535
4536         if ($status{$addr} eq $PLACEHOLDER) {
4537
4538             # A placeholder table doesn't get documented, is a perl extension,
4539             # and quite likely will be empty
4540             $make_pod_entry = 0 if ! defined $make_pod_entry;
4541             $perl_extension = 1 if ! defined $perl_extension;
4542             push @tables_that_may_be_empty, $complete_name{$addr};
4543         }
4544         elsif (! $status{$addr}) {
4545
4546             # If hasn't set its status already, see if it is on one of the
4547             # lists of properties or tables that have particular statuses; if
4548             # not, is normal.  The lists are prioritized so the most serious
4549             # ones are checked first
4550             if (exists $why_suppressed{$complete_name}
4551                 # Don't suppress if overridden
4552                 && ! grep { $_ eq $complete_name{$addr} }
4553                                                     @output_mapped_properties)
4554             {
4555                 $status{$addr} = $SUPPRESSED;
4556             }
4557             elsif (exists $why_deprecated{$complete_name}) {
4558                 $status{$addr} = $DEPRECATED;
4559             }
4560             elsif (exists $why_stabilized{$complete_name}) {
4561                 $status{$addr} = $STABILIZED;
4562             }
4563             elsif (exists $why_obsolete{$complete_name}) {
4564                 $status{$addr} = $OBSOLETE;
4565             }
4566
4567             # Existence above doesn't necessarily mean there is a message
4568             # associated with it.  Use the most serious message.
4569             if ($status{$addr}) {
4570                 if ($why_suppressed{$complete_name}) {
4571                     $status_info{$addr}
4572                                 = $why_suppressed{$complete_name};
4573                 }
4574                 elsif ($why_deprecated{$complete_name}) {
4575                     $status_info{$addr}
4576                                 = $why_deprecated{$complete_name};
4577                 }
4578                 elsif ($why_stabilized{$complete_name}) {
4579                     $status_info{$addr}
4580                                 = $why_stabilized{$complete_name};
4581                 }
4582                 elsif ($why_obsolete{$complete_name}) {
4583                     $status_info{$addr}
4584                                 = $why_obsolete{$complete_name};
4585                 }
4586             }
4587         }
4588
4589         $perl_extension{$addr} = $perl_extension || 0;
4590
4591         # By convention what typically gets printed only or first is what's
4592         # first in the list, so put the full name there for good output
4593         # clarity.  Other routines rely on the full name being first on the
4594         # list
4595         $self->add_alias($full_name{$addr},
4596                             Externally_Ok => $externally_ok,
4597                             Fuzzy => $loose_match,
4598                             Pod_Entry => $make_pod_entry,
4599                             Status => $status{$addr},
4600                             );
4601
4602         # Then comes the other name, if meaningfully different.
4603         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4604             $self->add_alias($name{$addr},
4605                             Externally_Ok => $externally_ok,
4606                             Fuzzy => $loose_match,
4607                             Pod_Entry => $make_pod_entry,
4608                             Status => $status{$addr},
4609                             );
4610         }
4611
4612         return $self;
4613     }
4614
4615     # Here are the methods that are required to be defined by any derived
4616     # class
4617     for my $sub (qw(
4618                     handle_special_range
4619                     append_to_body
4620                     pre_body
4621                 ))
4622                 # write() knows how to write out normal ranges, but it calls
4623                 # handle_special_range() when it encounters a non-normal one.
4624                 # append_to_body() is called by it after it has handled all
4625                 # ranges to add anything after the main portion of the table.
4626                 # And finally, pre_body() is called after all this to build up
4627                 # anything that should appear before the main portion of the
4628                 # table.  Doing it this way allows things in the middle to
4629                 # affect what should appear before the main portion of the
4630                 # table.
4631     {
4632         no strict "refs";
4633         *$sub = sub {
4634             Carp::my_carp_bug( __LINE__
4635                               . ": Must create method '$sub()' for "
4636                               . ref shift);
4637             return;
4638         }
4639     }
4640
4641     use overload
4642         fallback => 0,
4643         "." => \&main::_operator_dot,
4644         '!=' => \&main::_operator_not_equal,
4645         '==' => \&main::_operator_equal,
4646     ;
4647
4648     sub ranges {
4649         # Returns the array of ranges associated with this table.
4650
4651         no overloading;
4652         return $range_list{pack 'J', shift}->ranges;
4653     }
4654
4655     sub add_alias {
4656         # Add a synonym for this table.
4657
4658         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4659
4660         my $self = shift;
4661         my $name = shift;       # The name to add.
4662         my $pointer = shift;    # What the alias hash should point to.  For
4663                                 # map tables, this is the parent property;
4664                                 # for match tables, it is the table itself.
4665
4666         my %args = @_;
4667         my $loose_match = delete $args{'Fuzzy'};
4668
4669         my $make_pod_entry = delete $args{'Pod_Entry'};
4670         $make_pod_entry = $YES unless defined $make_pod_entry;
4671
4672         my $externally_ok = delete $args{'Externally_Ok'};
4673         $externally_ok = 1 unless defined $externally_ok;
4674
4675         my $status = delete $args{'Status'};
4676         $status = $NORMAL unless defined $status;
4677
4678         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4679
4680         # Capitalize the first letter of the alias unless it is one of the CJK
4681         # ones which specifically begins with a lower 'k'.  Do this because
4682         # Unicode has varied whether they capitalize first letters or not, and
4683         # have later changed their minds and capitalized them, but not the
4684         # other way around.  So do it always and avoid changes from release to
4685         # release
4686         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4687
4688         my $addr = do { no overloading; pack 'J', $self; };
4689
4690         # Figure out if should be loosely matched if not already specified.
4691         if (! defined $loose_match) {
4692
4693             # Is a loose_match if isn't null, and doesn't begin with an
4694             # underscore and isn't just a number
4695             if ($name ne ""
4696                 && substr($name, 0, 1) ne '_'
4697                 && $name !~ qr{^[0-9_.+-/]+$})
4698             {
4699                 $loose_match = 1;
4700             }
4701             else {
4702                 $loose_match = 0;
4703             }
4704         }
4705
4706         # If this alias has already been defined, do nothing.
4707         return if defined $find_table_from_alias{$addr}->{$name};
4708
4709         # That includes if it is standardly equivalent to an existing alias,
4710         # in which case, add this name to the list, so won't have to search
4711         # for it again.
4712         my $standard_name = main::standardize($name);
4713         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4714             $find_table_from_alias{$addr}->{$name}
4715                         = $find_table_from_alias{$addr}->{$standard_name};
4716             return;
4717         }
4718
4719         # Set the index hash for this alias for future quick reference.
4720         $find_table_from_alias{$addr}->{$name} = $pointer;
4721         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4722         local $to_trace = 0 if main::DEBUG;
4723         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4724         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4725
4726
4727         # Put the new alias at the end of the list of aliases unless the final
4728         # element begins with an underscore (meaning it is for internal perl
4729         # use) or is all numeric, in which case, put the new one before that
4730         # one.  This floats any all-numeric or underscore-beginning aliases to
4731         # the end.  This is done so that they are listed last in output lists,
4732         # to encourage the user to use a better name (either more descriptive
4733         # or not an internal-only one) instead.  This ordering is relied on
4734         # implicitly elsewhere in this program, like in short_name()
4735         my $list = $aliases{$addr};
4736         my $insert_position = (@$list == 0
4737                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4738                                     && $list->[-1]->name =~ /\D/))
4739                             ? @$list
4740                             : @$list - 1;
4741         splice @$list,
4742                 $insert_position,
4743                 0,
4744                 Alias->new($name, $loose_match, $make_pod_entry,
4745                                                     $externally_ok, $status);
4746
4747         # This name may be shorter than any existing ones, so clear the cache
4748         # of the shortest, so will have to be recalculated.
4749         no overloading;
4750         undef $short_name{pack 'J', $self};
4751         return;
4752     }
4753
4754     sub short_name {
4755         # Returns a name suitable for use as the base part of a file name.
4756         # That is, shorter wins.  It can return undef if there is no suitable
4757         # name.  The name has all non-essential underscores removed.
4758
4759         # The optional second parameter is a reference to a scalar in which
4760         # this routine will store the length the returned name had before the
4761         # underscores were removed, or undef if the return is undef.
4762
4763         # The shortest name can change if new aliases are added.  So using
4764         # this should be deferred until after all these are added.  The code
4765         # that does that should clear this one's cache.
4766         # Any name with alphabetics is preferred over an all numeric one, even
4767         # if longer.
4768
4769         my $self = shift;
4770         my $nominal_length_ptr = shift;
4771         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4772
4773         my $addr = do { no overloading; pack 'J', $self; };
4774
4775         # For efficiency, don't recalculate, but this means that adding new
4776         # aliases could change what the shortest is, so the code that does
4777         # that needs to undef this.
4778         if (defined $short_name{$addr}) {
4779             if ($nominal_length_ptr) {
4780                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4781             }
4782             return $short_name{$addr};
4783         }
4784
4785         # Look at each alias
4786         foreach my $alias ($self->aliases()) {
4787
4788             # Don't use an alias that isn't ok to use for an external name.
4789             next if ! $alias->externally_ok;
4790
4791             my $name = main::Standardize($alias->name);
4792             trace $self, $name if main::DEBUG && $to_trace;
4793
4794             # Take the first one, or a shorter one that isn't numeric.  This
4795             # relies on numeric aliases always being last in the array
4796             # returned by aliases().  Any alpha one will have precedence.
4797             if (! defined $short_name{$addr}
4798                 || ($name =~ /\D/
4799                     && length($name) < length($short_name{$addr})))
4800             {
4801                 # Remove interior underscores.
4802                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4803
4804                 $nominal_short_name_length{$addr} = length $name;
4805             }
4806         }
4807
4808         # If the short name isn't a nice one, perhaps an equivalent table has
4809         # a better one.
4810         if (! defined $short_name{$addr}
4811             || $short_name{$addr} eq ""
4812             || $short_name{$addr} eq "_")
4813         {
4814             my $return;
4815             foreach my $follower ($self->children) {    # All equivalents
4816                 my $follower_name = $follower->short_name;
4817                 next unless defined $follower_name;
4818
4819                 # Anything (except undefined) is better than underscore or
4820                 # empty
4821                 if (! defined $return || $return eq "_") {
4822                     $return = $follower_name;
4823                     next;
4824                 }
4825
4826                 # If the new follower name isn't "_" and is shorter than the
4827                 # current best one, prefer the new one.
4828                 next if $follower_name eq "_";
4829                 next if length $follower_name > length $return;
4830                 $return = $follower_name;
4831             }
4832             $short_name{$addr} = $return if defined $return;
4833         }
4834
4835         # If no suitable external name return undef
4836         if (! defined $short_name{$addr}) {
4837             $$nominal_length_ptr = undef if $nominal_length_ptr;
4838             return;
4839         }
4840
4841         # Don't allow a null short name.
4842         if ($short_name{$addr} eq "") {
4843             $short_name{$addr} = '_';
4844             $nominal_short_name_length{$addr} = 1;
4845         }
4846
4847         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4848
4849         if ($nominal_length_ptr) {
4850             $$nominal_length_ptr = $nominal_short_name_length{$addr};
4851         }
4852         return $short_name{$addr};
4853     }
4854
4855     sub external_name {
4856         # Returns the external name that this table should be known by.  This
4857         # is usually the short_name, but not if the short_name is undefined,
4858         # in which case the external_name is arbitrarily set to the
4859         # underscore.
4860
4861         my $self = shift;
4862         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4863
4864         my $short = $self->short_name;
4865         return $short if defined $short;
4866
4867         return '_';
4868     }
4869
4870     sub add_description { # Adds the parameter as a short description.
4871
4872         my $self = shift;
4873         my $description = shift;
4874         chomp $description;
4875         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4876
4877         no overloading;
4878         push @{$description{pack 'J', $self}}, $description;
4879
4880         return;
4881     }
4882
4883     sub add_note { # Adds the parameter as a short note.
4884
4885         my $self = shift;
4886         my $note = shift;
4887         chomp $note;
4888         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4889
4890         no overloading;
4891         push @{$note{pack 'J', $self}}, $note;
4892
4893         return;
4894     }
4895
4896     sub add_comment { # Adds the parameter as a comment.
4897
4898         return unless $debugging_build;
4899
4900         my $self = shift;
4901         my $comment = shift;
4902         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4903
4904         chomp $comment;
4905
4906         no overloading;
4907         push @{$comment{pack 'J', $self}}, $comment;
4908
4909         return;
4910     }
4911
4912     sub comment {
4913         # Return the current comment for this table.  If called in list
4914         # context, returns the array of comments.  In scalar, returns a string
4915         # of each element joined together with a period ending each.
4916
4917         my $self = shift;
4918         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4919
4920         my $addr = do { no overloading; pack 'J', $self; };
4921         my @list = @{$comment{$addr}};
4922         return @list if wantarray;
4923         my $return = "";
4924         foreach my $sentence (@list) {
4925             $return .= '.  ' if $return;
4926             $return .= $sentence;
4927             $return =~ s/\.$//;
4928         }
4929         $return .= '.' if $return;
4930         return $return;
4931     }
4932
4933     sub initialize {
4934         # Initialize the table with the argument which is any valid
4935         # initialization for range lists.
4936
4937         my $self = shift;
4938         my $addr = do { no overloading; pack 'J', $self; };
4939         my $initialization = shift;
4940         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4941
4942         # Replace the current range list with a new one of the same exact
4943         # type.
4944         my $class = ref $range_list{$addr};
4945         $range_list{$addr} = $class->new(Owner => $self,
4946                                         Initialize => $initialization);
4947         return;
4948
4949     }
4950
4951     sub header {
4952         # The header that is output for the table in the file it is written
4953         # in.
4954
4955         my $self = shift;
4956         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4957
4958         my $return = "";
4959         $return .= $DEVELOPMENT_ONLY if $compare_versions;
4960         $return .= $HEADER;
4961         no overloading;
4962         $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
4963         return $return;
4964     }
4965
4966     sub write {
4967         # Write a representation of the table to its file.  It calls several
4968         # functions furnished by sub-classes of this abstract base class to
4969         # handle non-normal ranges, to add stuff before the table, and at its
4970         # end.
4971
4972         my $self = shift;
4973         my $tab_stops = shift;       # The number of tab stops over to put any
4974                                      # comment.
4975         my $suppress_value = shift;  # Optional, if the value associated with
4976                                      # a range equals this one, don't write
4977                                      # the range
4978         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4979
4980         my $addr = do { no overloading; pack 'J', $self; };
4981
4982         # Start with the header
4983         my @HEADER = $self->header;
4984
4985         # Then the comments
4986         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4987                                                         if $comment{$addr};
4988
4989         # Things discovered processing the main body of the document may
4990         # affect what gets output before it, therefore pre_body() isn't called
4991         # until after all other processing of the table is done.
4992
4993         # The main body looks like a 'here' document.  If annotating, get rid
4994         # of the comments before passing to the caller, as some callers, such
4995         # as charnames.pm, can't cope with them.  (Outputting range counts
4996         # also introduces comments, but these don't show up in the tables that
4997         # can't cope with comments, and there aren't that many of them that
4998         # it's worth the extra real time to get rid of them).
4999         my @OUT;
5000         if ($annotate) {
5001             # Use the line below in Perls that don't have /r
5002             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5003             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5004         } else {
5005             push @OUT, "return <<'END';\n";
5006         }
5007
5008         if ($range_list{$addr}->is_empty) {
5009
5010             # This is a kludge for empty tables to silence a warning in
5011             # utf8.c, which can't really deal with empty tables, but it can
5012             # deal with a table that matches nothing, as the inverse of 'Any'
5013             # does.
5014             push @OUT, "!utf8::Any\n";
5015         }
5016         elsif ($self->name eq 'N'
5017
5018                # To save disk space and table cache space, avoid putting out
5019                # binary N tables, but instead create a file which just inverts
5020                # the Y table.  Since the file will still exist and occupy a
5021                # certain number of blocks, might as well output the whole
5022                # thing if it all will fit in one block.   The number of
5023                # ranges below is an approximate number for that.
5024                && $self->property->type == $BINARY
5025                # && $self->property->tables == 2  Can't do this because the
5026                #        non-binary properties, like NFDQC aren't specifiable
5027                #        by the notation
5028                && $range_list{$addr}->ranges > 15
5029                && ! $annotate)  # Under --annotate, want to see everything
5030         {
5031             push @OUT, "!utf8::" . $self->property->name . "\n";
5032         }
5033         else {
5034             my $range_size_1 = $range_size_1{$addr};
5035             my $format;            # Used only in $annotate option
5036             my $include_name;      # Used only in $annotate option
5037
5038             if ($annotate) {
5039
5040                 # if annotating each code point, must print 1 per line.
5041                 # The variable could point to a subroutine, and we don't want
5042                 # to lose that fact, so only set if not set already
5043                 $range_size_1 = 1 if ! $range_size_1;
5044
5045                 $format = $self->format;
5046
5047                 # The name of the character is output only for tables that
5048                 # don't already include the name in the output.
5049                 my $property = $self->property;
5050                 $include_name =
5051                     !  ($property == $perl_charname
5052                         || $property == main::property_ref('Unicode_1_Name')
5053                         || $property == main::property_ref('Name')
5054                         || $property == main::property_ref('Name_Alias')
5055                        );
5056             }
5057
5058             # Output each range as part of the here document.
5059             RANGE:
5060             for my $set ($range_list{$addr}->ranges) {
5061                 if ($set->type != 0) {
5062                     $self->handle_special_range($set);
5063                     next RANGE;
5064                 }
5065                 my $start = $set->start;
5066                 my $end   = $set->end;
5067                 my $value  = $set->value;
5068
5069                 # Don't output ranges whose value is the one to suppress
5070                 next RANGE if defined $suppress_value
5071                               && $value eq $suppress_value;
5072
5073                 # If there is a range and doesn't need a single point range
5074                 # output
5075                 if ($start != $end && ! $range_size_1) {
5076                     push @OUT, sprintf "%04X\t%04X", $start, $end;
5077                     $OUT[-1] .= "\t$value" if $value ne "";
5078
5079                     # Add a comment with the size of the range, if requested.
5080                     # Expand Tabs to make sure they all start in the same
5081                     # column, and then unexpand to use mostly tabs.
5082                     if (! $output_range_counts{$addr}) {
5083                         $OUT[-1] .= "\n";
5084                     }
5085                     else {
5086                         $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5087                         my $count = main::clarify_number($end - $start + 1);
5088                         use integer;
5089
5090                         my $width = $tab_stops * 8 - 1;
5091                         $OUT[-1] = sprintf("%-*s # [%s]\n",
5092                                             $width,
5093                                             $OUT[-1],
5094                                             $count);
5095                         $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5096                     }
5097                     next RANGE;
5098                 }
5099
5100                 # Here to output a single code point per line
5101
5102                 # If not to annotate, use the simple formats
5103                 if (! $annotate) {
5104
5105                     # Use any passed in subroutine to output.
5106                     if (ref $range_size_1 eq 'CODE') {
5107                         for my $i ($start .. $end) {
5108                             push @OUT, &{$range_size_1}($i, $value);
5109                         }
5110                     }
5111                     else {
5112
5113                         # Here, caller is ok with default output.
5114                         for (my $i = $start; $i <= $end; $i++) {
5115                             push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5116                         }
5117                     }
5118                     next RANGE;
5119                 }
5120
5121                 # Here, wants annotation.
5122                 for (my $i = $start; $i <= $end; $i++) {
5123
5124                     # Get character information if don't have it already
5125                     main::populate_char_info($i)
5126                                         if ! defined $viacode[$i];
5127                     my $type = $annotate_char_type[$i];
5128
5129                     # Figure out if should output the next code points as part
5130                     # of a range or not.  If this is not in an annotation
5131                     # range, then won't output as a range, so returns $i.
5132                     # Otherwise use the end of the annotation range, but no
5133                     # further than the maximum possible end point of the loop.
5134                     my $range_end = main::min($annotate_ranges->value_of($i)
5135                                                                         || $i,
5136                                                $end);
5137
5138                     # Use a range if it is a range, and either is one of the
5139                     # special annotation ranges, or the range is at most 3
5140                     # long.  This last case causes the algorithmically named
5141                     # code points to be output individually in spans of at
5142                     # most 3, as they are the ones whose $type is > 0.
5143                     if ($range_end != $i
5144                         && ( $type < 0 || $range_end - $i > 2))
5145                     {
5146                         # Here is to output a range.  We don't allow a
5147                         # caller-specified output format--just use the
5148                         # standard one.
5149                         push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5150                                                                 $range_end,
5151                                                                 $value;
5152                         my $range_name = $viacode[$i];
5153
5154                         # For the code points which end in their hex value, we
5155                         # eliminate that from the output annotation, and
5156                         # capitalize only the first letter of each word.
5157                         if ($type == $CP_IN_NAME) {
5158                             my $hex = sprintf "%04X", $i;
5159                             $range_name =~ s/-$hex$//;
5160                             my @words = split " ", $range_name;
5161                             for my $word (@words) {
5162                                 $word = ucfirst(lc($word)) if $word ne 'CJK';
5163                             }
5164                             $range_name = join " ", @words;
5165                         }
5166                         elsif ($type == $HANGUL_SYLLABLE) {
5167                             $range_name = "Hangul Syllable";
5168                         }
5169
5170                         $OUT[-1] .= " $range_name" if $range_name;
5171
5172                         # Include the number of code points in the range
5173                         my $count = main::clarify_number($range_end - $i + 1);
5174                         $OUT[-1] .= " [$count]\n";
5175
5176                         # Skip to the end of the range
5177                         $i = $range_end;
5178                     }
5179                     else { # Not in a range.
5180                         my $comment = "";
5181
5182                         # When outputting the names of each character, use
5183                         # the character itself if printable
5184                         $comment .= "'" . chr($i) . "' " if $printable[$i];
5185
5186                         # To make it more readable, use a minimum indentation
5187                         my $comment_indent;
5188
5189                         # Determine the annotation
5190                         if ($format eq $DECOMP_STRING_FORMAT) {
5191
5192                             # This is very specialized, with the type of
5193                             # decomposition beginning the line enclosed in
5194                             # <...>, and the code points that the code point
5195                             # decomposes to separated by blanks.  Create two
5196                             # strings, one of the printable characters, and
5197                             # one of their official names.
5198                             (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5199                             my $tostr = "";
5200                             my $to_name = "";
5201                             my $to_chr = "";
5202                             foreach my $to (split " ", $map) {
5203                                 $to = CORE::hex $to;
5204                                 $to_name .= " + " if $to_name;
5205                                 $to_chr .= chr($to);
5206                                 main::populate_char_info($to)
5207                                                     if ! defined $viacode[$to];
5208                                 $to_name .=  $viacode[$to];
5209                             }
5210
5211                             $comment .=
5212                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5213                             $comment_indent = 25;   # Determined by experiment
5214                         }
5215                         else {
5216
5217                             # Assume that any table that has hex format is a
5218                             # mapping of one code point to another.
5219                             if ($format eq $HEX_FORMAT) {
5220                                 my $decimal_value = CORE::hex $value;
5221                                 main::populate_char_info($decimal_value)
5222                                         if ! defined $viacode[$decimal_value];
5223                                 $comment .= "=> '"
5224                                          . chr($decimal_value)
5225                                          . "'; " if $printable[$decimal_value];
5226                             }
5227                             $comment .= $viacode[$i] if $include_name
5228                                                         && $viacode[$i];
5229                             if ($format eq $HEX_FORMAT) {
5230                                 my $decimal_value = CORE::hex $value;
5231                                 $comment .= " => $viacode[$decimal_value]"
5232                                                     if $viacode[$decimal_value];
5233                             }
5234
5235                             # If including the name, no need to indent, as the
5236                             # name will already be way across the line.
5237                             $comment_indent = ($include_name) ? 0 : 60;
5238                         }
5239
5240                         # Use any passed in routine to output the base part of
5241                         # the line.
5242                         if (ref $range_size_1 eq 'CODE') {
5243                             my $base_part = &{$range_size_1}($i, $value);
5244                             chomp $base_part;
5245                             push @OUT, $base_part;
5246                         }
5247                         else {
5248                             push @OUT, sprintf "%04X\t\t%s", $i, $value;
5249                         }
5250
5251                         # And add the annotation.
5252                         $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5253                                                          $OUT[-1],
5254                                                          $comment if $comment;
5255                         $OUT[-1] .= "\n";
5256                     }
5257                 }
5258             } # End of loop through all the table's ranges
5259         }
5260
5261         # Add anything that goes after the main body, but within the here
5262         # document,
5263         my $append_to_body = $self->append_to_body;
5264         push @OUT, $append_to_body if $append_to_body;
5265
5266         # And finish the here document.
5267         push @OUT, "END\n";
5268
5269         # Done with the main portion of the body.  Can now figure out what
5270         # should appear before it in the file.
5271         my $pre_body = $self->pre_body;
5272         push @HEADER, $pre_body, "\n" if $pre_body;
5273
5274         # All these files should have a .pl suffix added to them.
5275         my @file_with_pl = @{$file_path{$addr}};
5276         $file_with_pl[-1] .= '.pl';
5277
5278         main::write(\@file_with_pl,
5279                     $annotate,      # utf8 iff annotating
5280                     \@HEADER,
5281                     \@OUT);
5282         return;
5283     }
5284
5285     sub set_status {    # Set the table's status
5286         my $self = shift;
5287         my $status = shift; # The status enum value
5288         my $info = shift;   # Any message associated with it.
5289         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5290
5291         my $addr = do { no overloading; pack 'J', $self; };
5292
5293         $status{$addr} = $status;
5294         $status_info{$addr} = $info;
5295         return;
5296     }
5297
5298     sub lock {
5299         # Don't allow changes to the table from now on.  This stores a stack
5300         # trace of where it was called, so that later attempts to modify it
5301         # can immediately show where it got locked.
5302
5303         my $self = shift;
5304         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5305
5306         my $addr = do { no overloading; pack 'J', $self; };
5307
5308         $locked{$addr} = "";
5309
5310         my $line = (caller(0))[2];
5311         my $i = 1;
5312
5313         # Accumulate the stack trace
5314         while (1) {
5315             my ($pkg, $file, $caller_line, $caller) = caller $i++;
5316
5317             last unless defined $caller;
5318
5319             $locked{$addr} .= "    called from $caller() at line $line\n";
5320             $line = $caller_line;
5321         }
5322         $locked{$addr} .= "    called from main at line $line\n";
5323
5324         return;
5325     }
5326
5327     sub carp_if_locked {
5328         # Return whether a table is locked or not, and, by the way, complain
5329         # if is locked
5330
5331         my $self = shift;
5332         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5333
5334         my $addr = do { no overloading; pack 'J', $self; };
5335
5336         return 0 if ! $locked{$addr};
5337         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5338         return 1;
5339     }
5340
5341     sub set_file_path { # Set the final directory path for this table
5342         my $self = shift;
5343         # Rest of parameters passed on
5344
5345         no overloading;
5346         @{$file_path{pack 'J', $self}} = @_;
5347         return
5348     }
5349
5350     # Accessors for the range list stored in this table.  First for
5351     # unconditional
5352     for my $sub (qw(
5353                     containing_range
5354                     contains
5355                     count
5356                     each_range
5357                     hash
5358                     is_empty
5359                     matches_identically_to
5360                     max
5361                     min
5362                     range_count
5363                     reset_each_range
5364                     type_of
5365                     value_of
5366                 ))
5367     {
5368         no strict "refs";
5369         *$sub = sub {
5370             use strict "refs";
5371             my $self = shift;
5372             no overloading;
5373             return $range_list{pack 'J', $self}->$sub(@_);
5374         }
5375     }
5376
5377     # Then for ones that should fail if locked
5378     for my $sub (qw(
5379                     delete_range
5380                 ))
5381     {
5382         no strict "refs";
5383         *$sub = sub {
5384             use strict "refs";
5385             my $self = shift;
5386
5387             return if $self->carp_if_locked;
5388             no overloading;
5389             return $range_list{pack 'J', $self}->$sub(@_);
5390         }
5391     }
5392
5393 } # End closure
5394
5395 package Map_Table;
5396 use base '_Base_Table';
5397
5398 # A Map Table is a table that contains the mappings from code points to
5399 # values.  There are two weird cases:
5400 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5401 #    are written in the table's file at the end of the table nonetheless.  It
5402 #    requires specially constructed code to handle these; utf8.c can not read
5403 #    these in, so they should not go in $map_directory.  As of this writing,
5404 #    the only case that these happen is for named sequences used in
5405 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
5406 #    something else could come along that uses it.
5407 # 2) Specials are anything that doesn't fit syntactically into the body of the
5408 #    table.  The ranges for these have a map type of non-zero.  The code below
5409 #    knows about and handles each possible type.   In most cases, these are
5410 #    written as part of the header.
5411 #
5412 # A map table deliberately can't be manipulated at will unlike match tables.
5413 # This is because of the ambiguities having to do with what to do with
5414 # overlapping code points.  And there just isn't a need for those things;
5415 # what one wants to do is just query, add, replace, or delete mappings, plus
5416 # write the final result.
5417 # However, there is a method to get the list of possible ranges that aren't in
5418 # this table to use for defaulting missing code point mappings.  And,
5419 # map_add_or_replace_non_nulls() does allow one to add another table to this
5420 # one, but it is clearly very specialized, and defined that the other's
5421 # non-null values replace this one's if there is any overlap.
5422
5423 sub trace { return main::trace(@_); }
5424
5425 { # Closure
5426
5427     main::setup_package();
5428
5429     my %default_map;
5430     # Many input files omit some entries; this gives what the mapping for the
5431     # missing entries should be
5432     main::set_access('default_map', \%default_map, 'r');
5433
5434     my %anomalous_entries;
5435     # Things that go in the body of the table which don't fit the normal
5436     # scheme of things, like having a range.  Not much can be done with these
5437     # once there except to output them.  This was created to handle named
5438     # sequences.
5439     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5440     main::set_access('anomalous_entries',       # Append singular, read plural
5441                     \%anomalous_entries,
5442                     'readable_array');
5443
5444     my %core_access;
5445     # This is a string, solely for documentation, indicating how one can get
5446     # access to this property via the Perl core.
5447     main::set_access('core_access', \%core_access, 'r', 's');
5448
5449     my %to_output_map;
5450     # Enum as to whether or not to write out this map table:
5451     #   0               don't output
5452     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
5453     #                   it should not be removed nor its format changed.  This
5454     #                   is done for those files that have traditionally been
5455     #                   output.
5456     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
5457     #                   with this file
5458     main::set_access('to_output_map', \%to_output_map, 's');
5459
5460
5461     sub new {
5462         my $class = shift;
5463         my $name = shift;
5464
5465         my %args = @_;
5466
5467         # Optional initialization data for the table.
5468         my $initialize = delete $args{'Initialize'};
5469
5470         my $core_access = delete $args{'Core_Access'};
5471         my $default_map = delete $args{'Default_Map'};
5472         my $property = delete $args{'_Property'};
5473         my $full_name = delete $args{'Full_Name'};
5474
5475         # Rest of parameters passed on
5476
5477         my $range_list = Range_Map->new(Owner => $property);
5478
5479         my $self = $class->SUPER::new(
5480                                     Name => $name,
5481                                     Complete_Name =>  $full_name,
5482                                     Full_Name => $full_name,
5483                                     _Property => $property,
5484                                     _Range_List => $range_list,
5485                                     %args);
5486
5487         my $addr = do { no overloading; pack 'J', $self; };
5488
5489         $anomalous_entries{$addr} = [];
5490         $core_access{$addr} = $core_access;
5491         $default_map{$addr} = $default_map;
5492
5493         $self->initialize($initialize) if defined $initialize;
5494
5495         return $self;
5496     }
5497
5498     use overload
5499         fallback => 0,
5500         qw("") => "_operator_stringify",
5501     ;
5502
5503     sub _operator_stringify {
5504         my $self = shift;
5505
5506         my $name = $self->property->full_name;
5507         $name = '""' if $name eq "";
5508         return "Map table for Property '$name'";
5509     }
5510
5511     sub add_alias {
5512         # Add a synonym for this table (which means the property itself)
5513         my $self = shift;
5514         my $name = shift;
5515         # Rest of parameters passed on.
5516
5517         $self->SUPER::add_alias($name, $self->property, @_);
5518         return;
5519     }
5520
5521     sub add_map {
5522         # Add a range of code points to the list of specially-handled code
5523         # points.  $MULTI_CP is assumed if the type of special is not passed
5524         # in.
5525
5526         my $self = shift;
5527         my $lower = shift;
5528         my $upper = shift;
5529         my $string = shift;
5530         my %args = @_;
5531
5532         my $type = delete $args{'Type'} || 0;
5533         # Rest of parameters passed on
5534
5535         # Can't change the table if locked.
5536         return if $self->carp_if_locked;
5537
5538         my $addr = do { no overloading; pack 'J', $self; };
5539
5540         $self->_range_list->add_map($lower, $upper,
5541                                     $string,
5542                                     @_,
5543                                     Type => $type);
5544         return;
5545     }
5546
5547     sub append_to_body {
5548         # Adds to the written HERE document of the table's body any anomalous
5549         # entries in the table..
5550
5551         my $self = shift;
5552         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5553
5554         my $addr = do { no overloading; pack 'J', $self; };
5555
5556         return "" unless @{$anomalous_entries{$addr}};
5557         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5558     }
5559
5560     sub map_add_or_replace_non_nulls {
5561         # This adds the mappings in the table $other to $self.  Non-null
5562         # mappings from $other override those in $self.  It essentially merges
5563         # the two tables, with the second having priority except for null
5564         # mappings.
5565
5566         my $self = shift;
5567         my $other = shift;
5568         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5569
5570         return if $self->carp_if_locked;
5571
5572         if (! $other->isa(__PACKAGE__)) {
5573             Carp::my_carp_bug("$other should be a "
5574                         . __PACKAGE__
5575                         . ".  Not a '"
5576                         . ref($other)
5577                         . "'.  Not added;");
5578             return;
5579         }
5580
5581         my $addr = do { no overloading; pack 'J', $self; };
5582         my $other_addr = do { no overloading; pack 'J', $other; };
5583
5584         local $to_trace = 0 if main::DEBUG;
5585
5586         my $self_range_list = $self->_range_list;
5587         my $other_range_list = $other->_range_list;
5588         foreach my $range ($other_range_list->ranges) {
5589             my $value = $range->value;
5590             next if $value eq "";
5591             $self_range_list->_add_delete('+',
5592                                           $range->start,
5593                                           $range->end,
5594                                           $value,
5595                                           Type => $range->type,
5596                                           Replace => $UNCONDITIONALLY);
5597         }
5598
5599         return;
5600     }
5601
5602     sub set_default_map {
5603         # Define what code points that are missing from the input files should
5604         # map to
5605
5606         my $self = shift;
5607         my $map = shift;
5608         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5609
5610         my $addr = do { no overloading; pack 'J', $self; };
5611
5612         # Convert the input to the standard equivalent, if any (won't have any
5613         # for $STRING properties)
5614         my $standard = $self->_find_table_from_alias->{$map};
5615         $map = $standard->name if defined $standard;
5616
5617         # Warn if there already is a non-equivalent default map for this
5618         # property.  Note that a default map can be a ref, which means that
5619         # what it actually means is delayed until later in the program, and it
5620         # IS permissible to override it here without a message.
5621         my $default_map = $default_map{$addr};
5622         if (defined $default_map
5623             && ! ref($default_map)
5624             && $default_map ne $map
5625             && main::Standardize($map) ne $default_map)
5626         {
5627             my $property = $self->property;
5628             my $map_table = $property->table($map);
5629             my $default_table = $property->table($default_map);
5630             if (defined $map_table
5631                 && defined $default_table
5632                 && $map_table != $default_table)
5633             {
5634                 Carp::my_carp("Changing the default mapping for "
5635                             . $property
5636                             . " from $default_map to $map'");
5637             }
5638         }
5639
5640         $default_map{$addr} = $map;
5641
5642         # Don't also create any missing table for this map at this point,
5643         # because if we did, it could get done before the main table add is
5644         # done for PropValueAliases.txt; instead the caller will have to make
5645         # sure it exists, if desired.
5646         return;
5647     }
5648
5649     sub to_output_map {
5650         # Returns boolean: should we write this map table?
5651
5652         my $self = shift;
5653         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5654
5655         my $addr = do { no overloading; pack 'J', $self; };
5656
5657         # If overridden, use that
5658         return $to_output_map{$addr} if defined $to_output_map{$addr};
5659
5660         my $full_name = $self->full_name;
5661         return $global_to_output_map{$full_name}
5662                                 if defined $global_to_output_map{$full_name};
5663
5664         # If table says to output, do so; if says to suppress it, do so.
5665         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
5666         return 0 if $self->status eq $SUPPRESSED;
5667
5668         my $type = $self->property->type;
5669
5670         # Don't want to output binary map tables even for debugging.
5671         return 0 if $type == $BINARY;
5672
5673         # But do want to output string ones.
5674         return $EXTERNAL_MAP if $type == $STRING;
5675
5676         # Otherwise is an $ENUM, do output it, for Perl's purposes
5677         return $INTERNAL_MAP;
5678     }
5679
5680     sub inverse_list {
5681         # Returns a Range_List that is gaps of the current table.  That is,
5682         # the inversion
5683
5684         my $self = shift;
5685         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5686
5687         my $current = Range_List->new(Initialize => $self->_range_list,
5688                                 Owner => $self->property);
5689         return ~ $current;
5690     }
5691
5692     sub header {
5693         my $self = shift;
5694         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5695
5696         my $return = $self->SUPER::header();
5697
5698         $return .= $INTERNAL_ONLY if $self->to_output_map == $INTERNAL_MAP;
5699         return $return;
5700     }
5701
5702     sub set_final_comment {
5703         # Just before output, create the comment that heads the file
5704         # containing this table.
5705
5706         return unless $debugging_build;
5707
5708         my $self = shift;
5709         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5710
5711         # No sense generating a comment if aren't going to write it out.
5712         return if ! $self->to_output_map;
5713
5714         my $addr = do { no overloading; pack 'J', $self; };
5715
5716         my $property = $self->property;
5717
5718         # Get all the possible names for this property.  Don't use any that
5719         # aren't ok for use in a file name, etc.  This is perhaps causing that
5720         # flag to do double duty, and may have to be changed in the future to
5721         # have our own flag for just this purpose; but it works now to exclude
5722         # Perl generated synonyms from the lists for properties, where the
5723         # name is always the proper Unicode one.
5724         my @property_aliases = grep { $_->externally_ok } $self->aliases;
5725
5726         my $count = $self->count;
5727         my $default_map = $default_map{$addr};
5728
5729         # The ranges that map to the default aren't output, so subtract that
5730         # to get those actually output.  A property with matching tables
5731         # already has the information calculated.
5732         if ($property->type != $STRING) {
5733             $count -= $property->table($default_map)->count;
5734         }
5735         elsif (defined $default_map) {
5736
5737             # But for $STRING properties, must calculate now.  Subtract the
5738             # count from each range that maps to the default.
5739             foreach my $range ($self->_range_list->ranges) {
5740                 if ($range->value eq $default_map) {
5741                     $count -= $range->end +1 - $range->start;
5742                 }
5743             }
5744
5745         }
5746
5747         # Get a  string version of $count with underscores in large numbers,
5748         # for clarity.
5749         my $string_count = main::clarify_number($count);
5750
5751         my $code_points = ($count == 1)
5752                         ? 'single code point'
5753                         : "$string_count code points";
5754
5755         my $mapping;
5756         my $these_mappings;
5757         my $are;
5758         if (@property_aliases <= 1) {
5759             $mapping = 'mapping';
5760             $these_mappings = 'this mapping';
5761             $are = 'is'
5762         }
5763         else {
5764             $mapping = 'synonymous mappings';
5765             $these_mappings = 'these mappings';
5766             $are = 'are'
5767         }
5768         my $cp;
5769         if ($count >= $MAX_UNICODE_CODEPOINTS) {
5770             $cp = "any code point in Unicode Version $string_version";
5771         }
5772         else {
5773             my $map_to;
5774             if ($default_map eq "") {
5775                 $map_to = 'the null string';
5776             }
5777             elsif ($default_map eq $CODE_POINT) {
5778                 $map_to = "itself";
5779             }
5780             else {
5781                 $map_to = "'$default_map'";
5782             }
5783             if ($count == 1) {
5784                 $cp = "the single code point";
5785             }
5786             else {
5787                 $cp = "one of the $code_points";
5788             }
5789             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5790         }
5791
5792         my $comment = "";
5793
5794         my $status = $self->status;
5795         if ($status) {
5796             my $warn = uc $status_past_participles{$status};
5797             $comment .= <<END;
5798
5799 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
5800  All property or property=value combinations contained in this file are $warn.
5801  See $unicode_reference_url for what this means.
5802
5803 END
5804         }
5805         $comment .= "This file returns the $mapping:\n";
5806
5807         for my $i (0 .. @property_aliases - 1) {
5808             $comment .= sprintf("%-8s%s\n",
5809                                 " ",
5810                                 $property_aliases[$i]->name . '(cp)'
5811                                 );
5812         }
5813         $comment .=
5814                 "\nwhere 'cp' is $cp.  Note that $these_mappings $are ";
5815
5816         my $access = $core_access{$addr};
5817         if ($access) {
5818             $comment .= "accessible through the Perl core via $access.";
5819         }
5820         else {
5821             $comment .= "not accessible through the Perl core directly.";
5822         }
5823
5824         # And append any commentary already set from the actual property.
5825         $comment .= "\n\n" . $self->comment if $self->comment;
5826         if ($self->description) {
5827             $comment .= "\n\n" . join " ", $self->description;
5828         }
5829         if ($self->note) {
5830             $comment .= "\n\n" . join " ", $self->note;
5831         }
5832         $comment .= "\n";
5833
5834         if (! $self->perl_extension) {
5835             $comment .= <<END;
5836
5837 For information about what this property really means, see:
5838 $unicode_reference_url
5839 END
5840         }
5841
5842         if ($count) {        # Format differs for empty table
5843                 $comment.= "\nThe format of the ";
5844             if ($self->range_size_1) {
5845                 $comment.= <<END;
5846 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5847 is in hex; MAPPING is what CODE_POINT maps to.
5848 END
5849             }
5850             else {
5851
5852                 # There are tables which end up only having one element per
5853                 # range, but it is not worth keeping track of for making just
5854                 # this comment a little better.
5855                 $comment.= <<END;
5856 non-comment portions of the main body of lines of this file is:
5857 START\\tSTOP\\tMAPPING where START is the starting code point of the
5858 range, in hex; STOP is the ending point, or if omitted, the range has just one
5859 code point; MAPPING is what each code point between START and STOP maps to.
5860 END
5861                 if ($self->output_range_counts) {
5862                     $comment .= <<END;
5863 Numbers in comments in [brackets] indicate how many code points are in the
5864 range (omitted when the range is a single code point or if the mapping is to
5865 the null string).
5866 END
5867                 }
5868             }
5869         }
5870         $self->set_comment(main::join_lines($comment));
5871         return;
5872     }
5873
5874     my %swash_keys; # Makes sure don't duplicate swash names.
5875
5876     # The remaining variables are temporaries used while writing each table,
5877     # to output special ranges.
5878     my $has_hangul_syllables;
5879     my @multi_code_point_maps;  # Map is to more than one code point.
5880
5881     # The key is the base name of the code point, and the value is an
5882     # array giving all the ranges that use this base name.  Each range
5883     # is actually a hash giving the 'low' and 'high' values of it.
5884     my %names_ending_in_code_point;
5885     my %loose_names_ending_in_code_point;
5886
5887     # Inverse mapping.  The list of ranges that have these kinds of
5888     # names.  Each element contains the low, high, and base names in an
5889     # anonymous hash.
5890     my @code_points_ending_in_code_point;
5891
5892     sub handle_special_range {
5893         # Called in the middle of write when it finds a range it doesn't know
5894         # how to handle.
5895
5896         my $self = shift;
5897         my $range = shift;
5898         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5899
5900         my $addr = do { no overloading; pack 'J', $self; };
5901
5902         my $type = $range->type;
5903
5904         my $low = $range->start;
5905         my $high = $range->end;
5906         my $map = $range->value;
5907
5908         # No need to output the range if it maps to the default.
5909         return if $map eq $default_map{$addr};
5910
5911         # Switch based on the map type...
5912         if ($type == $HANGUL_SYLLABLE) {
5913
5914             # These are entirely algorithmically determinable based on
5915             # some constants furnished by Unicode; for now, just set a
5916             # flag to indicate that have them.  After everything is figured
5917             # out, we will output the code that does the algorithm.
5918             $has_hangul_syllables = 1;
5919         }
5920         elsif ($type == $CP_IN_NAME) {
5921
5922             # Code points whose the name ends in their code point are also
5923             # algorithmically determinable, but need information about the map
5924             # to do so.  Both the map and its inverse are stored in data
5925             # structures output in the file.
5926             push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5927             push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5928
5929             my $squeezed = $map =~ s/[-\s]+//gr;
5930             push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, $low;
5931             push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high;
5932
5933             push @code_points_ending_in_code_point, { low => $low,
5934                                                       high => $high,
5935                                                       name => $map
5936                                                     };
5937         }
5938         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5939
5940             # Multi-code point maps and null string maps have an entry
5941             # for each code point in the range.  They use the same
5942             # output format.
5943             for my $code_point ($low .. $high) {
5944
5945                 # The pack() below can't cope with surrogates.  XXX This may
5946                 # no longer be true
5947                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5948                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
5949                     next;
5950                 }
5951
5952                 # Generate the hash entries for these in the form that
5953                 # utf8.c understands.
5954                 my $tostr = "";
5955                 my $to_name = "";
5956                 my $to_chr = "";
5957                 foreach my $to (split " ", $map) {
5958                     if ($to !~ /^$code_point_re$/) {
5959                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
5960                         next;
5961                     }
5962                     $tostr .= sprintf "\\x{%s}", $to;
5963                     $to = CORE::hex $to;
5964                     if ($annotate) {
5965                         $to_name .= " + " if $to_name;
5966                         $to_chr .= chr($to);
5967                         main::populate_char_info($to)
5968                                             if ! defined $viacode[$to];
5969                         $to_name .=  $viacode[$to];
5970                     }
5971                 }
5972
5973                 # I (khw) have never waded through this line to
5974                 # understand it well enough to comment it.
5975                 my $utf8 = sprintf(qq["%s" => "$tostr",],
5976                         join("", map { sprintf "\\x%02X", $_ }
5977                             unpack("U0C*", pack("U", $code_point))));
5978
5979                 # Add a comment so that a human reader can more easily
5980                 # see what's going on.
5981                 push @multi_code_point_maps,
5982                         sprintf("%-45s # U+%04X", $utf8, $code_point);
5983                 if (! $annotate) {
5984                     $multi_code_point_maps[-1] .= " => $map";
5985                 }
5986                 else {
5987                     main::populate_char_info($code_point)
5988                                     if ! defined $viacode[$code_point];
5989                     $multi_code_point_maps[-1] .= " '"
5990                         . chr($code_point)
5991                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
5992                 }
5993             }
5994         }
5995         else {
5996             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
5997         }
5998
5999         return;
6000     }
6001
6002     sub pre_body {
6003         # Returns the string that should be output in the file before the main
6004         # body of this table.  It isn't called until the main body is
6005         # calculated, saving a pass.  The string includes some hash entries
6006         # identifying the format of the body, and what the single value should
6007         # be for all ranges missing from it.  It also includes any code points
6008         # which have map_types that don't go in the main table.
6009
6010         my $self = shift;
6011         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6012
6013         my $addr = do { no overloading; pack 'J', $self; };
6014
6015         my $name = $self->property->swash_name;
6016
6017         if (defined $swash_keys{$name}) {
6018             Carp::my_carp(join_lines(<<END
6019 Already created a swash name '$name' for $swash_keys{$name}.  This means that
6020 the same name desired for $self shouldn't be used.  Bad News.  This must be
6021 fixed before production use, but proceeding anyway
6022 END
6023             ));
6024         }
6025         $swash_keys{$name} = "$self";
6026
6027         my $pre_body = "";
6028
6029         # Here we assume we were called after have gone through the whole
6030         # file.  If we actually generated anything for each map type, add its
6031         # respective header and trailer
6032         my $specials_name = "";
6033         if (@multi_code_point_maps) {
6034             $specials_name = "utf8::ToSpec$name";
6035             $pre_body .= <<END;
6036
6037 # Some code points require special handling because their mappings are each to
6038 # multiple code points.  These do not appear in the main body, but are defined
6039 # in the hash below.
6040
6041 # Each key is the string of N bytes that together make up the UTF-8 encoding
6042 # for the code point.  (i.e. the same as looking at the code point's UTF-8
6043 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
6044 \%$specials_name = (
6045 END
6046             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6047         }
6048
6049         if ($has_hangul_syllables || @code_points_ending_in_code_point) {
6050
6051             # Convert these structures to output format.
6052             my $code_points_ending_in_code_point =
6053                 main::simple_dumper(\@code_points_ending_in_code_point,
6054                                     ' ' x 8);
6055             my $names = main::simple_dumper(\%names_ending_in_code_point,
6056                                             ' ' x 8);
6057             my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
6058                                             ' ' x 8);
6059
6060             # Do the same with the Hangul names,
6061             my $jamo;
6062             my $jamo_l;
6063             my $jamo_v;
6064             my $jamo_t;
6065             my $jamo_re;
6066             if ($has_hangul_syllables) {
6067
6068                 # Construct a regular expression of all the possible
6069                 # combinations of the Hangul syllables.
6070                 my @L_re;   # Leading consonants
6071                 for my $i ($LBase .. $LBase + $LCount - 1) {
6072                     push @L_re, $Jamo{$i}
6073                 }
6074                 my @V_re;   # Middle vowels
6075                 for my $i ($VBase .. $VBase + $VCount - 1) {
6076                     push @V_re, $Jamo{$i}
6077                 }
6078                 my @T_re;   # Trailing consonants
6079                 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
6080                     push @T_re, $Jamo{$i}
6081                 }
6082
6083                 # The whole re is made up of the L V T combination.
6084                 $jamo_re = '('
6085                             . join ('|', sort @L_re)
6086                             . ')('
6087                             . join ('|', sort @V_re)
6088                             . ')('
6089                             . join ('|', sort @T_re)
6090                             . ')?';
6091
6092                 # These hashes needed by the algorithm were generated
6093                 # during reading of the Jamo.txt file
6094                 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
6095                 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
6096                 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
6097                 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
6098             }
6099
6100             $pre_body .= <<END;
6101
6102 # To achieve significant memory savings when this file is read in,
6103 # algorithmically derivable code points are omitted from the main body below.
6104 # Instead, the following routines can be used to translate between name and
6105 # code point and vice versa
6106
6107 { # Closure
6108
6109     # Matches legal code point.  4-6 hex numbers, If there are 6, the
6110     # first two must be '10'; if there are 5, the first must not be a '0'.
6111     # First can match at the end of a word provided that the end of the
6112     # word doesn't look like a hex number.
6113     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
6114     my \$code_point_re = qr/$code_point_re/;
6115
6116     # In the following hash, the keys are the bases of names which includes
6117     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
6118     # of each key is another hash which is used to get the low and high ends
6119     # for each range of code points that apply to the name.
6120     my %names_ending_in_code_point = (
6121 $names
6122     );
6123
6124     # The following hash is a copy of the previous one, except is for loose
6125     # matching, so each name has blanks and dashes squeezed out
6126     my %loose_names_ending_in_code_point = (
6127 $loose_names
6128     );
6129
6130     # And the following array gives the inverse mapping from code points to
6131     # names.  Lowest code points are first
6132     my \@code_points_ending_in_code_point = (
6133 $code_points_ending_in_code_point
6134     );
6135 END
6136             # Earlier releases didn't have Jamos.  No sense outputting
6137             # them unless will be used.
6138             if ($has_hangul_syllables) {
6139                 $pre_body .= <<END;
6140
6141     # Convert from code point to Jamo short name for use in composing Hangul
6142     # syllable names
6143     my %Jamo = (
6144 $jamo
6145     );
6146
6147     # Leading consonant (can be null)
6148     my %Jamo_L = (
6149 $jamo_l
6150     );
6151
6152     # Vowel
6153     my %Jamo_V = (
6154 $jamo_v
6155     );
6156
6157     # Optional trailing consonant
6158     my %Jamo_T = (
6159 $jamo_t
6160     );
6161
6162     # Computed re that splits up a Hangul name into LVT or LV syllables
6163     my \$syllable_re = qr/$jamo_re/;
6164
6165     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
6166     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
6167
6168     # These constants names and values were taken from the Unicode standard,
6169     # version 5.1, section 3.12.  They are used in conjunction with Hangul
6170     # syllables
6171     my \$SBase = $SBase_string;
6172     my \$LBase = $LBase_string;
6173     my \$VBase = $VBase_string;
6174     my \$TBase = $TBase_string;
6175     my \$SCount = $SCount;
6176     my \$LCount = $LCount;
6177     my \$VCount = $VCount;
6178     my \$TCount = $TCount;
6179     my \$NCount = \$VCount * \$TCount;
6180 END
6181             } # End of has Jamos
6182
6183             $pre_body .= << 'END';
6184
6185     sub name_to_code_point_special {
6186         my ($name, $loose) = @_;
6187
6188         # Returns undef if not one of the specially handled names; otherwise
6189         # returns the code point equivalent to the input name
6190         # $loose is non-zero if to use loose matching, 'name' in that case
6191         # must be input as upper case with all blanks and dashes squeezed out.
6192 END
6193             if ($has_hangul_syllables) {
6194                 $pre_body .= << 'END';
6195
6196         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
6197             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
6198         {
6199             return if $name !~ qr/^$syllable_re$/;
6200             my $L = $Jamo_L{$1};
6201             my $V = $Jamo_V{$2};
6202             my $T = (defined $3) ? $Jamo_T{$3} : 0;
6203             return ($L * $VCount + $V) * $TCount + $T + $SBase;
6204         }
6205 END
6206             }
6207             $pre_body .= << 'END';
6208
6209         # Name must end in 'code_point' for this to handle.
6210         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
6211                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
6212
6213         my $base = $1;
6214         my $code_point = CORE::hex $2;
6215         my $names_ref;
6216
6217         if ($loose) {
6218             $names_ref = \%loose_names_ending_in_code_point;
6219         }
6220         else {
6221             return if $base !~ s/-$//;
6222             $names_ref = \%names_ending_in_code_point;
6223         }
6224
6225         # Name must be one of the ones which has the code point in it.
6226         return if ! $names_ref->{$base};
6227
6228         # Look through the list of ranges that apply to this name to see if
6229         # the code point is in one of them.
6230         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
6231             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
6232             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
6233
6234             # Here, the code point is in the range.
6235             return $code_point;
6236         }
6237
6238         # Here, looked like the name had a code point number in it, but
6239         # did not match one of the valid ones.
6240         return;
6241     }
6242
6243     sub code_point_to_name_special {
6244         my $code_point = shift;
6245
6246         # Returns the name of a code point if algorithmically determinable;
6247         # undef if not
6248 END
6249             if ($has_hangul_syllables) {
6250                 $pre_body .= << 'END';
6251
6252         # If in the Hangul range, calculate the name based on Unicode's
6253         # algorithm
6254         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
6255             use integer;
6256             my $SIndex = $code_point - $SBase;
6257             my $L = $LBase + $SIndex / $NCount;
6258             my $V = $VBase + ($SIndex % $NCount) / $TCount;
6259             my $T = $TBase + $SIndex % $TCount;
6260             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
6261             $name .= $Jamo{$T} if $T != $TBase;
6262             return $name;
6263         }
6264 END
6265             }
6266             $pre_body .= << 'END';
6267
6268         # Look through list of these code points for one in range.
6269         foreach my $hash (@code_points_ending_in_code_point) {
6270             return if $code_point < $hash->{'low'};
6271             if ($code_point <= $hash->{'high'}) {
6272                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
6273             }
6274         }
6275         return;            # None found
6276     }
6277 } # End closure
6278
6279 END
6280         } # End of has hangul or code point in name maps.
6281
6282         my $format = $self->format;
6283
6284         my $return = <<END;
6285 # The name this swash is to be known by, with the format of the mappings in
6286 # the main body of the table, and what all code points missing from this file
6287 # map to.
6288 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6289 END
6290         if ($specials_name) {
6291         $return .= <<END;
6292 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6293 END
6294         }
6295         my $default_map = $default_map{$addr};
6296         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6297
6298         if ($default_map eq $CODE_POINT) {
6299             $return .= ' # code point maps to itself';
6300         }
6301         elsif ($default_map eq "") {
6302             $return .= ' # code point maps to the null string';
6303         }
6304         $return .= "\n";
6305
6306         $return .= $pre_body;
6307
6308         return $return;
6309     }
6310
6311     sub write {
6312         # Write the table to the file.
6313
6314         my $self = shift;
6315         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6316
6317         my $addr = do { no overloading; pack 'J', $self; };
6318
6319         # Clear the temporaries
6320         $has_hangul_syllables = 0;
6321         undef @multi_code_point_maps;
6322         undef %names_ending_in_code_point;
6323         undef %loose_names_ending_in_code_point;
6324         undef @code_points_ending_in_code_point;
6325
6326         # Calculate the format of the table if not already done.
6327         my $format = $self->format;
6328         my $type = $self->property->type;
6329         my $default_map = $self->default_map;
6330         if (! defined $format) {
6331             if ($type == $BINARY) {
6332
6333                 # Don't bother checking the values, because we elsewhere
6334                 # verify that a binary table has only 2 values.
6335                 $format = $BINARY_FORMAT;
6336             }
6337             else {
6338                 my @ranges = $self->_range_list->ranges;
6339
6340                 # default an empty table based on its type and default map
6341                 if (! @ranges) {
6342
6343                     # But it turns out that the only one we can say is a
6344                     # non-string (besides binary, handled above) is when the
6345                     # table is a string and the default map is to a code point
6346                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6347                         $format = $HEX_FORMAT;
6348                     }
6349                     else {
6350                         $format = $STRING_FORMAT;
6351                     }
6352                 }
6353                 else {
6354
6355                     # Start with the most restrictive format, and as we find
6356                     # something that doesn't fit with that, change to the next
6357                     # most restrictive, and so on.
6358                     $format = $DECIMAL_FORMAT;
6359                     foreach my $range (@ranges) {
6360                         next if $range->type != 0;  # Non-normal ranges don't
6361                                                     # affect the main body
6362                         my $map = $range->value;
6363                         if ($map ne $default_map) {
6364                             last if $format eq $STRING_FORMAT;  # already at
6365                                                                 # least
6366                                                                 # restrictive
6367                             $format = $INTEGER_FORMAT
6368                                                 if $format eq $DECIMAL_FORMAT
6369                                                     && $map !~ / ^ [0-9] $ /x;
6370                             $format = $FLOAT_FORMAT
6371                                             if $format eq $INTEGER_FORMAT
6372                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6373                             $format = $RATIONAL_FORMAT
6374                                 if $format eq $FLOAT_FORMAT
6375                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6376                             $format = $HEX_FORMAT
6377                             if $format eq $RATIONAL_FORMAT
6378                                 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6379                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6380                                                        && $map =~ /[^0-9A-F]/;
6381                         }
6382                     }
6383                 }
6384             }
6385         } # end of calculating format
6386
6387         if ($default_map eq $CODE_POINT
6388             && $format ne $HEX_FORMAT
6389             && ! defined $self->format)    # manual settings are always
6390                                            # considered ok
6391         {
6392             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6393         }
6394
6395         $self->_set_format($format);
6396
6397         # Core Perl has a different definition of mapping ranges than we do,
6398         # that is applicable mainly to mapping code points, so for tables
6399         # where it is possible that core Perl could be used to read it,
6400         # make it range size 1 to prevent possible confusion
6401         $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6402
6403         return $self->SUPER::write(
6404             ($self->property == $block)
6405                 ? 7     # block file needs more tab stops
6406                 : 3,
6407             $default_map);   # don't write defaulteds
6408     }
6409
6410     # Accessors for the underlying list that should fail if locked.
6411     for my $sub (qw(
6412                     add_duplicate
6413                 ))
6414     {
6415         no strict "refs";
6416         *$sub = sub {
6417             use strict "refs";
6418             my $self = shift;
6419
6420             return if $self->carp_if_locked;
6421             return $self->_range_list->$sub(@_);
6422         }
6423     }
6424 } # End closure for Map_Table
6425
6426 package Match_Table;
6427 use base '_Base_Table';
6428
6429 # A Match table is one which is a list of all the code points that have
6430 # the same property and property value, for use in \p{property=value}
6431 # constructs in regular expressions.  It adds very little data to the base
6432 # structure, but many methods, as these lists can be combined in many ways to
6433 # form new ones.
6434 # There are only a few concepts added:
6435 # 1) Equivalents and Relatedness.
6436 #    Two tables can match the identical code points, but have different names.
6437 #    This always happens when there is a perl single form extension
6438 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
6439 #    tables are set to be related, with the Perl extension being a child, and
6440 #    the Unicode property being the parent.
6441 #
6442 #    It may be that two tables match the identical code points and we don't
6443 #    know if they are related or not.  This happens most frequently when the
6444 #    Block and Script properties have the exact range.  But note that a
6445 #    revision to Unicode could add new code points to the script, which would
6446 #    now have to be in a different block (as the block was filled, or there
6447 #    would have been 'Unknown' script code points in it and they wouldn't have
6448 #    been identical).  So we can't rely on any two properties from Unicode
6449 #    always matching the same code points from release to release, and thus
6450 #    these tables are considered coincidentally equivalent--not related.  When
6451 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
6452 #    'leader', and the others are 'equivalents'.  This concept is useful
6453 #    to minimize the number of tables written out.  Only one file is used for
6454 #    any identical set of code points, with entries in Heavy.pl mapping all
6455 #    the involved tables to it.
6456 #
6457 #    Related tables will always be identical; we set them up to be so.  Thus
6458 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
6459 #    unrelated tables.  Relatedness makes generating the documentation easier.
6460 #
6461 # 2) Complement.
6462 #    Like equivalents, two tables may be the inverses of each other, the
6463 #    intersection between them is null, and the union is every Unicode code
6464 #    point.  The two tables that occupy a binary property are necessarily like
6465 #    this.  By specifying one table as the complement of another, we can avoid
6466 #    storing it on disk (using the other table and performing a fast
6467 #    transform), and some memory and calculations.
6468 #
6469 # 3) Conflicting.  It may be that there will eventually be name clashes, with
6470 #    the same name meaning different things.  For a while, there actually were
6471 #    conflicts, but they have so far been resolved by changing Perl's or
6472 #    Unicode's definitions to match the other, but when this code was written,
6473 #    it wasn't clear that that was what was going to happen.  (Unicode changed
6474 #    because of protests during their beta period.)  Name clashes are warned
6475 #    about during compilation, and the documentation.  The generated tables
6476 #    are sane, free of name clashes, because the code suppresses the Perl
6477 #    version.  But manual intervention to decide what the actual behavior
6478 #    should be may be required should this happen.  The introductory comments
6479 #    have more to say about this.
6480
6481 sub standardize { return main::standardize($_[0]); }
6482 sub trace { return main::trace(@_); }
6483
6484
6485 { # Closure
6486
6487     main::setup_package();
6488
6489     my %leader;
6490     # The leader table of this one; initially $self.
6491     main::set_access('leader', \%leader, 'r');
6492
6493     my %equivalents;
6494     # An array of any tables that have this one as their leader
6495     main::set_access('equivalents', \%equivalents, 'readable_array');
6496
6497     my %parent;
6498     # The parent table to this one, initially $self.  This allows us to
6499     # distinguish between equivalent tables that are related (for which this
6500     # is set to), and those which may not be, but share the same output file
6501     # because they match the exact same set of code points in the current
6502     # Unicode release.
6503     main::set_access('parent', \%parent, 'r');
6504
6505     my %children;
6506     # An array of any tables that have this one as their parent
6507     main::set_access('children', \%children, 'readable_array');
6508
6509     my %conflicting;
6510     # Array of any tables that would have the same name as this one with
6511     # a different meaning.  This is used for the generated documentation.
6512     main::set_access('conflicting', \%conflicting, 'readable_array');
6513
6514     my %matches_all;
6515     # Set in the constructor for tables that are expected to match all code
6516     # points.
6517     main::set_access('matches_all', \%matches_all, 'r');
6518
6519     my %complement;
6520     # Points to the complement that this table is expressed in terms of; 0 if
6521     # none.
6522     main::set_access('complement', \%complement, 'r', 's' );
6523
6524     sub new {
6525         my $class = shift;
6526
6527         my %args = @_;
6528
6529         # The property for which this table is a listing of property values.
6530         my $property = delete $args{'_Property'};
6531
6532         my $name = delete $args{'Name'};
6533         my $full_name = delete $args{'Full_Name'};
6534         $full_name = $name if ! defined $full_name;
6535
6536         # Optional
6537         my $initialize = delete $args{'Initialize'};
6538         my $matches_all = delete $args{'Matches_All'} || 0;
6539         my $format = delete $args{'Format'};
6540         # Rest of parameters passed on.
6541
6542         my $range_list = Range_List->new(Initialize => $initialize,
6543                                          Owner => $property);
6544
6545         my $complete = $full_name;
6546         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
6547                                               # but this helps debug if it
6548                                               # does
6549         # The complete name for a match table includes it's property in a
6550         # compound form 'property=table', except if the property is the
6551         # pseudo-property, perl, in which case it is just the single form,
6552         # 'table' (If you change the '=' must also change the ':' in lots of
6553         # places in this program that assume an equal sign)
6554         $complete = $property->full_name . "=$complete" if $property != $perl;
6555
6556         my $self = $class->SUPER::new(%args,
6557                                       Name => $name,
6558                                       Complete_Name => $complete,
6559                                       Full_Name => $full_name,
6560                                       _Property => $property,
6561                                       _Range_List => $range_list,
6562                                       Format => $EMPTY_FORMAT,
6563                                       );
6564         my $addr = do { no overloading; pack 'J', $self; };
6565
6566         $conflicting{$addr} = [ ];
6567         $equivalents{$addr} = [ ];
6568         $children{$addr} = [ ];
6569         $matches_all{$addr} = $matches_all;
6570         $leader{$addr} = $self;
6571         $parent{$addr} = $self;
6572         $complement{$addr} = 0;
6573
6574         if (defined $format && $format ne $EMPTY_FORMAT) {
6575             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
6576         }
6577
6578         return $self;
6579     }
6580
6581     # See this program's beginning comment block about overloading these.
6582     use overload
6583         fallback => 0,
6584         qw("") => "_operator_stringify",
6585         '=' => sub {
6586                     my $self = shift;
6587
6588                     return if $self->carp_if_locked;
6589                     return $self;
6590                 },
6591
6592         '+' => sub {
6593                         my $self = shift;
6594                         my $other = shift;
6595
6596                         return $self->_range_list + $other;
6597                     },
6598         '&' => sub {
6599                         my $self = shift;
6600                         my $other = shift;
6601
6602                         return $self->_range_list & $other;
6603                     },
6604         '+=' => sub {
6605                         my $self = shift;
6606                         my $other = shift;
6607
6608                         return if $self->carp_if_locked;
6609
6610                         my $addr = do { no overloading; pack 'J', $self; };
6611
6612                         if (ref $other) {
6613
6614                             # Change the range list of this table to be the
6615                             # union of the two.
6616                             $self->_set_range_list($self->_range_list
6617                                                     + $other);
6618                         }
6619                         else {    # $other is just a simple value
6620                             $self->add_range($other, $other);
6621                         }
6622                         return $self;
6623                     },
6624         '-' => sub { my $self = shift;
6625                     my $other = shift;
6626                     my $reversed = shift;
6627
6628                     if ($reversed) {
6629                         Carp::my_carp_bug("Can't cope with a "
6630                             .  __PACKAGE__
6631                             . " being the first parameter in a '-'.  Subtraction ignored.");
6632                         return;
6633                     }
6634
6635                     return $self->_range_list - $other;
6636                 },
6637         '~' => sub { my $self = shift;
6638                     return ~ $self->_range_list;
6639                 },
6640     ;
6641
6642     sub _operator_stringify {
6643         my $self = shift;
6644
6645         my $name = $self->complete_name;
6646         return "Table '$name'";
6647     }
6648
6649     sub add_alias {
6650         # Add a synonym for this table.  See the comments in the base class
6651
6652         my $self = shift;
6653         my $name = shift;
6654         # Rest of parameters passed on.
6655
6656         $self->SUPER::add_alias($name, $self, @_);
6657         return;
6658     }
6659
6660     sub add_conflicting {
6661         # Add the name of some other object to the list of ones that name
6662         # clash with this match table.
6663
6664         my $self = shift;
6665         my $conflicting_name = shift;   # The name of the conflicting object
6666         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
6667         my $conflicting_object = shift; # Optional, the conflicting object
6668                                         # itself.  This is used to
6669                                         # disambiguate the text if the input
6670                                         # name is identical to any of the
6671                                         # aliases $self is known by.
6672                                         # Sometimes the conflicting object is
6673                                         # merely hypothetical, so this has to
6674                                         # be an optional parameter.
6675         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6676
6677         my $addr = do { no overloading; pack 'J', $self; };
6678
6679         # Check if the conflicting name is exactly the same as any existing
6680         # alias in this table (as long as there is a real object there to
6681         # disambiguate with).
6682         if (defined $conflicting_object) {
6683             foreach my $alias ($self->aliases) {
6684                 if ($alias->name eq $conflicting_name) {
6685
6686                     # Here, there is an exact match.  This results in
6687                     # ambiguous comments, so disambiguate by changing the
6688                     # conflicting name to its object's complete equivalent.
6689                     $conflicting_name = $conflicting_object->complete_name;
6690                     last;
6691                 }
6692             }
6693         }
6694
6695         # Convert to the \p{...} final name
6696         $conflicting_name = "\\$p" . "{$conflicting_name}";
6697
6698         # Only add once
6699         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6700
6701         push @{$conflicting{$addr}}, $conflicting_name;
6702
6703         return;
6704     }
6705
6706     sub is_set_equivalent_to {
6707         # Return boolean of whether or not the other object is a table of this
6708         # type and has been marked equivalent to this one.
6709
6710         my $self = shift;
6711         my $other = shift;
6712         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6713
6714         return 0 if ! defined $other; # Can happen for incomplete early
6715                                       # releases
6716         unless ($other->isa(__PACKAGE__)) {
6717             my $ref_other = ref $other;
6718             my $ref_self = ref $self;
6719             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
6720             return 0;
6721         }
6722
6723         # Two tables are equivalent if they have the same leader.
6724         no overloading;
6725         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6726         return;
6727     }
6728
6729     sub set_equivalent_to {
6730         # Set $self equivalent to the parameter table.
6731         # The required Related => 'x' parameter is a boolean indicating
6732         # whether these tables are related or not.  If related, $other becomes
6733         # the 'parent' of $self; if unrelated it becomes the 'leader'
6734         #
6735         # Related tables share all characteristics except names; equivalents
6736         # not quite so many.
6737         # If they are related, one must be a perl extension.  This is because
6738         # we can't guarantee that Unicode won't change one or the other in a
6739         # later release even if they are identical now.
6740
6741         my $self = shift;
6742         my $other = shift;
6743
6744         my %args = @_;
6745         my $related = delete $args{'Related'};
6746
6747         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6748
6749         return if ! defined $other;     # Keep on going; happens in some early
6750                                         # Unicode releases.
6751
6752         if (! defined $related) {
6753             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
6754             $related = 0;
6755         }
6756
6757         # If already are equivalent, no need to re-do it;  if subroutine
6758         # returns null, it found an error, also do nothing
6759         my $are_equivalent = $self->is_set_equivalent_to($other);
6760         return if ! defined $are_equivalent || $are_equivalent;
6761
6762         my $addr = do { no overloading; pack 'J', $self; };
6763         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6764
6765         if ($related) {
6766             if ($current_leader->perl_extension) {
6767                 if ($other->perl_extension) {
6768                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6769                     return;
6770                 }
6771             } elsif (! $other->perl_extension) {
6772                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
6773                 $related = 0;
6774             }
6775         }
6776
6777         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6778             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
6779             return;
6780         }
6781
6782         my $leader = do { no overloading; pack 'J', $current_leader; };
6783         my $other_addr = do { no overloading; pack 'J', $other; };
6784
6785         # Any tables that are equivalent to or children of this table must now
6786         # instead be equivalent to or (children) to the new leader (parent),
6787         # still equivalent.  The equivalency includes their matches_all info,
6788         # and for related tables, their status
6789         # All related tables are of necessity equivalent, but the converse
6790         # isn't necessarily true
6791         my $status = $other->status;
6792         my $status_info = $other->status_info;
6793         my $matches_all = $matches_all{other_addr};
6794         my $caseless_equivalent = $other->caseless_equivalent;
6795         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6796             next if $table == $other;
6797             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6798
6799             my $table_addr = do { no overloading; pack 'J', $table; };
6800             $leader{$table_addr} = $other;
6801             $matches_all{$table_addr} = $matches_all;
6802             $self->_set_range_list($other->_range_list);
6803             push @{$equivalents{$other_addr}}, $table;
6804             if ($related) {
6805                 $parent{$table_addr} = $other;
6806                 push @{$children{$other_addr}}, $table;
6807                 $table->set_status($status, $status_info);
6808                 $self->set_caseless_equivalent($caseless_equivalent);
6809             }
6810         }
6811
6812         # Now that we've declared these to be equivalent, any changes to one
6813         # of the tables would invalidate that equivalency.
6814         $self->lock;
6815         $other->lock;
6816         return;
6817     }
6818
6819     sub add_range { # Add a range to the list for this table.
6820         my $self = shift;
6821         # Rest of parameters passed on
6822
6823         return if $self->carp_if_locked;
6824         return $self->_range_list->add_range(@_);
6825     }
6826
6827     sub pre_body {  # Does nothing for match tables.
6828         return
6829     }
6830
6831     sub append_to_body {  # Does nothing for match tables.
6832         return
6833     }
6834
6835     sub write {
6836         my $self = shift;
6837         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6838
6839         return $self->SUPER::write(2); # 2 tab stops
6840     }
6841
6842     sub set_final_comment {
6843         # This creates a comment for the file that is to hold the match table
6844         # $self.  It is somewhat convoluted to make the English read nicely,
6845         # but, heh, it's just a comment.
6846         # This should be called only with the leader match table of all the
6847         # ones that share the same file.  It lists all such tables, ordered so
6848         # that related ones are together.
6849
6850         return unless $debugging_build;
6851
6852         my $leader = shift;   # Should only be called on the leader table of
6853                               # an equivalent group
6854         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6855
6856         my $addr = do { no overloading; pack 'J', $leader; };
6857
6858         if ($leader{$addr} != $leader) {
6859             Carp::my_carp_bug(<<END
6860 set_final_comment() must be called on a leader table, which $leader is not.
6861 It is equivalent to $leader{$addr}.  No comment created
6862 END
6863             );
6864             return;
6865         }
6866
6867         # Get the number of code points matched by each of the tables in this
6868         # file, and add underscores for clarity.
6869         my $count = $leader->count;
6870         my $string_count = main::clarify_number($count);
6871
6872         my $loose_count = 0;        # how many aliases loosely matched
6873         my $compound_name = "";     # ? Are any names compound?, and if so, an
6874                                     # example
6875         my $properties_with_compound_names = 0;    # count of these
6876
6877
6878         my %flags;              # The status flags used in the file
6879         my $total_entries = 0;  # number of entries written in the comment
6880         my $matches_comment = ""; # The portion of the comment about the
6881                                   # \p{}'s
6882         my @global_comments;    # List of all the tables' comments that are
6883                                 # there before this routine was called.
6884
6885         # Get list of all the parent tables that are equivalent to this one
6886         # (including itself).
6887         my @parents = grep { $parent{main::objaddr $_} == $_ }
6888                             main::uniques($leader, @{$equivalents{$addr}});
6889         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
6890                                               # tables
6891
6892         for my $parent (@parents) {
6893
6894             my $property = $parent->property;
6895
6896             # Special case 'N' tables in properties with two match tables when
6897             # the other is a 'Y' one.  These are likely to be binary tables,
6898             # but not necessarily.  In either case, \P{} will match the
6899             # complement of \p{}, and so if something is a synonym of \p, the
6900             # complement of that something will be the synonym of \P.  This
6901             # would be true of any property with just two match tables, not
6902             # just those whose values are Y and N; but that would require a
6903             # little extra work, and there are none such so far in Unicode.
6904             my $perl_p = 'p';        # which is it?  \p{} or \P{}
6905             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
6906
6907             if (scalar $property->tables == 2
6908                 && $parent == $property->table('N')
6909                 && defined (my $yes = $property->table('Y')))
6910             {
6911                 my $yes_addr = do { no overloading; pack 'J', $yes; };
6912                 @yes_perl_synonyms
6913                     = grep { $_->property == $perl }
6914                                     main::uniques($yes,
6915                                                 $parent{$yes_addr},
6916                                                 $parent{$yes_addr}->children);
6917
6918                 # But these synonyms are \P{} ,not \p{}
6919                 $perl_p = 'P';
6920             }
6921
6922             my @description;        # Will hold the table description
6923             my @note;               # Will hold the table notes.
6924             my @conflicting;        # Will hold the table conflicts.
6925
6926             # Look at the parent, any yes synonyms, and all the children
6927             my $parent_addr = do { no overloading; pack 'J', $parent; };
6928             for my $table ($parent,
6929                            @yes_perl_synonyms,
6930                            @{$children{$parent_addr}})
6931             {
6932                 my $table_addr = do { no overloading; pack 'J', $table; };
6933                 my $table_property = $table->property;
6934
6935                 # Tables are separated by a blank line to create a grouping.
6936                 $matches_comment .= "\n" if $matches_comment;
6937
6938                 # The table is named based on the property and value
6939                 # combination it is for, like script=greek.  But there may be
6940                 # a number of synonyms for each side, like 'sc' for 'script',
6941                 # and 'grek' for 'greek'.  Any combination of these is a valid
6942                 # name for this table.  In this case, there are three more,
6943                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
6944                 # listing all possible combinations in the comment, we make
6945                 # sure that each synonym occurs at least once, and add
6946                 # commentary that the other combinations are possible.
6947                 # Because regular expressions don't recognize things like
6948                 # \p{jsn=}, only look at non-null right-hand-sides
6949                 my @property_aliases = $table_property->aliases;
6950                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
6951
6952                 # The alias lists above are already ordered in the order we
6953                 # want to output them.  To ensure that each synonym is listed,
6954                 # we must use the max of the two numbers.  But if there are no
6955                 # legal synonyms (nothing in @table_aliases), then we don't
6956                 # list anything.
6957                 my $listed_combos = (@table_aliases)
6958                                     ?  main::max(scalar @table_aliases,
6959                                                  scalar @property_aliases)
6960                                     : 0;
6961                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6962
6963
6964                 my $property_had_compound_name = 0;
6965
6966                 for my $i (0 .. $listed_combos - 1) {
6967                     $total_entries++;
6968
6969                     # The current alias for the property is the next one on
6970                     # the list, or if beyond the end, start over.  Similarly
6971                     # for the table (\p{prop=table})
6972                     my $property_alias = $property_aliases
6973                                             [$i % @property_aliases]->name;
6974                     my $table_alias_object = $table_aliases
6975                                                         [$i % @table_aliases];
6976                     my $table_alias = $table_alias_object->name;
6977                     my $loose_match = $table_alias_object->loose_match;
6978
6979                     if ($table_alias !~ /\D/) { # Clarify large numbers.
6980                         $table_alias = main::clarify_number($table_alias)
6981                     }
6982
6983                     # Add a comment for this alias combination
6984                     my $current_match_comment;
6985                     if ($table_property == $perl) {
6986                         $current_match_comment = "\\$perl_p"
6987                                                     . "{$table_alias}";
6988                     }
6989                     else {
6990                         $current_match_comment
6991                                         = "\\p{$property_alias=$table_alias}";
6992                         $property_had_compound_name = 1;
6993                     }
6994
6995                     # Flag any abnormal status for this table.
6996                     my $flag = $property->status
6997                                 || $table->status
6998                                 || $table_alias_object->status;
6999                     if ($flag) {
7000                         if ($flag ne $PLACEHOLDER) {
7001                             $flags{$flag} = $status_past_participles{$flag};
7002                         } else {
7003                             $flags{$flag} = <<END;
7004 a placeholder because it is not in Version $string_version of Unicode, but is
7005 needed by the Perl core to work gracefully.  Because it is not in this version
7006 of Unicode, it will not be listed in $pod_file.pod
7007 END
7008                         }
7009                     }
7010
7011                     $loose_count++;
7012
7013                     # Pretty up the comment.  Note the \b; it says don't make
7014                     # this line a continuation.
7015                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7016                                         $flag,
7017                                         " " x 7,
7018                                         $current_match_comment);
7019                 } # End of generating the entries for this table.
7020
7021                 # Save these for output after this group of related tables.
7022                 push @description, $table->description;
7023                 push @note, $table->note;
7024                 push @conflicting, $table->conflicting;
7025
7026                 # And this for output after all the tables.
7027                 push @global_comments, $table->comment;
7028
7029                 # Compute an alternate compound name using the final property
7030                 # synonym and the first table synonym with a colon instead of
7031                 # the equal sign used elsewhere.
7032                 if ($property_had_compound_name) {
7033                     $properties_with_compound_names ++;
7034                     if (! $compound_name || @property_aliases > 1) {
7035                         $compound_name = $property_aliases[-1]->name
7036                                         . ': '
7037                                         . $table_aliases[0]->name;
7038                     }
7039                 }
7040             } # End of looping through all children of this table
7041
7042             # Here have assembled in $matches_comment all the related tables
7043             # to the current parent (preceded by the same info for all the
7044             # previous parents).  Put out information that applies to all of
7045             # the current family.
7046             if (@conflicting) {
7047
7048                 # But output the conflicting information now, as it applies to
7049                 # just this table.
7050                 my $conflicting = join ", ", @conflicting;
7051                 if ($conflicting) {
7052                     $matches_comment .= <<END;
7053
7054     Note that contrary to what you might expect, the above is NOT the same as
7055 END
7056                     $matches_comment .= "any of: " if @conflicting > 1;
7057                     $matches_comment .= "$conflicting\n";
7058                 }
7059             }
7060             if (@description) {
7061                 $matches_comment .= "\n    Meaning: "
7062                                     . join('; ', @description)
7063                                     . "\n";
7064             }
7065             if (@note) {
7066                 $matches_comment .= "\n    Note: "
7067                                     . join("\n    ", @note)
7068                                     . "\n";
7069             }
7070         } # End of looping through all tables
7071
7072
7073         my $code_points;
7074         my $match;
7075         my $any_of_these;
7076         if ($count == 1) {
7077             $match = 'matches';
7078             $code_points = 'single code point';
7079         }
7080         else {
7081             $match = 'match';
7082             $code_points = "$string_count code points";
7083         }
7084
7085         my $synonyms;
7086         my $entries;
7087         if ($total_entries == 1) {
7088             $synonyms = "";
7089             $entries = 'entry';
7090             $any_of_these = 'this'
7091         }
7092         else {
7093             $synonyms = " any of the following regular expression constructs";
7094             $entries = 'entries';
7095             $any_of_these = 'any of these'
7096         }
7097
7098         my $comment = "";
7099         if ($has_unrelated) {
7100             $comment .= <<END;
7101 This file is for tables that are not necessarily related:  To conserve
7102 resources, every table that matches the identical set of code points in this
7103 version of Unicode uses this file.  Each one is listed in a separate group
7104 below.  It could be that the tables will match the same set of code points in
7105 other Unicode releases, or it could be purely coincidence that they happen to
7106 be the same in Unicode $string_version, and hence may not in other versions.
7107
7108 END
7109         }
7110
7111         if (%flags) {
7112             foreach my $flag (sort keys %flags) {
7113                 $comment .= <<END;
7114 '$flag' below means that this form is $flags{$flag}.
7115 END
7116                 next if $flag eq $PLACEHOLDER;
7117                 $comment .= "Consult $pod_file.pod\n";
7118             }
7119             $comment .= "\n";
7120         }
7121
7122         if ($total_entries == 0) {
7123             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
7124             $comment .= <<END;
7125 This file returns the $code_points in Unicode Version $string_version for
7126 $leader, but it is inaccessible through Perl regular expressions, as
7127 "\\p{prop=}" is not recognized.
7128 END
7129
7130         } else {
7131             $comment .= <<END;
7132 This file returns the $code_points in Unicode Version $string_version that
7133 $match$synonyms:
7134
7135 $matches_comment
7136 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7137 including if adding or subtracting white space, underscore, and hyphen
7138 characters matters or doesn't matter, and other permissible syntactic
7139 variants.  Upper/lower case distinctions never matter.
7140 END
7141
7142         }
7143         if ($compound_name) {
7144             $comment .= <<END;
7145
7146 A colon can be substituted for the equals sign, and
7147 END
7148             if ($properties_with_compound_names > 1) {
7149                 $comment .= <<END;
7150 within each group above,
7151 END
7152             }
7153             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7154
7155             # Note the \b below, it says don't make that line a continuation.
7156             $comment .= <<END;
7157 anything to the left of the equals (or colon) can be combined with anything to
7158 the right.  Thus, for example,
7159 $compound_name
7160 \bis also valid.
7161 END
7162         }
7163
7164         # And append any comment(s) from the actual tables.  They are all
7165         # gathered here, so may not read all that well.
7166         if (@global_comments) {
7167             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7168         }
7169
7170         if ($count) {   # The format differs if no code points, and needs no
7171                         # explanation in that case
7172                 $comment.= <<END;
7173
7174 The format of the lines of this file is:
7175 END
7176             $comment.= <<END;
7177 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7178 STOP is the ending point, or if omitted, the range has just one code point.
7179 END
7180             if ($leader->output_range_counts) {
7181                 $comment .= <<END;
7182 Numbers in comments in [brackets] indicate how many code points are in the
7183 range.
7184 END
7185             }
7186         }
7187
7188         $leader->set_comment(main::join_lines($comment));
7189         return;
7190     }
7191
7192     # Accessors for the underlying list
7193     for my $sub (qw(
7194                     get_valid_code_point
7195                     get_invalid_code_point
7196                 ))
7197     {
7198         no strict "refs";
7199         *$sub = sub {
7200             use strict "refs";
7201             my $self = shift;
7202
7203             return $self->_range_list->$sub(@_);
7204         }
7205     }
7206 } # End closure for Match_Table
7207
7208 package Property;
7209
7210 # The Property class represents a Unicode property, or the $perl
7211 # pseudo-property.  It contains a map table initialized empty at construction
7212 # time, and for properties accessible through regular expressions, various
7213 # match tables, created through the add_match_table() method, and referenced
7214 # by the table('NAME') or tables() methods, the latter returning a list of all
7215 # of the match tables.  Otherwise table operations implicitly are for the map
7216 # table.
7217 #
7218 # Most of the data in the property is actually about its map table, so it
7219 # mostly just uses that table's accessors for most methods.  The two could
7220 # have been combined into one object, but for clarity because of their
7221 # differing semantics, they have been kept separate.  It could be argued that
7222 # the 'file' and 'directory' fields should be kept with the map table.
7223 #
7224 # Each property has a type.  This can be set in the constructor, or in the
7225 # set_type accessor, but mostly it is figured out by the data.  Every property
7226 # starts with unknown type, overridden by a parameter to the constructor, or
7227 # as match tables are added, or ranges added to the map table, the data is
7228 # inspected, and the type changed.  After the table is mostly or entirely
7229 # filled, compute_type() should be called to finalize they analysis.
7230 #
7231 # There are very few operations defined.  One can safely remove a range from
7232 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7233 # table to this one, replacing any in the intersection of the two.
7234
7235 sub standardize { return main::standardize($_[0]); }
7236 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7237
7238 {   # Closure
7239
7240     # This hash will contain as keys, all the aliases of all properties, and
7241     # as values, pointers to their respective property objects.  This allows
7242     # quick look-up of a property from any of its names.
7243     my %alias_to_property_of;
7244
7245     sub dump_alias_to_property_of {
7246         # For debugging
7247
7248         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7249         return;
7250     }
7251
7252     sub property_ref {
7253         # This is a package subroutine, not called as a method.
7254         # If the single parameter is a literal '*' it returns a list of all
7255         # defined properties.
7256         # Otherwise, the single parameter is a name, and it returns a pointer
7257         # to the corresponding property object, or undef if none.
7258         #
7259         # Properties can have several different names.  The 'standard' form of
7260         # each of them is stored in %alias_to_property_of as they are defined.
7261         # But it's possible that this subroutine will be called with some
7262         # variant, so if the initial lookup fails, it is repeated with the
7263         # standardized form of the input name.  If found, besides returning the
7264         # result, the input name is added to the list so future calls won't
7265         # have to do the conversion again.
7266
7267         my $name = shift;
7268
7269         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7270
7271         if (! defined $name) {
7272             Carp::my_carp_bug("Undefined input property.  No action taken.");
7273             return;
7274         }
7275
7276         return main::uniques(values %alias_to_property_of) if $name eq '*';
7277
7278         # Return cached result if have it.
7279         my $result = $alias_to_property_of{$name};
7280         return $result if defined $result;
7281
7282         # Convert the input to standard form.
7283         my $standard_name = standardize($name);
7284
7285         $result = $alias_to_property_of{$standard_name};
7286         return unless defined $result;        # Don't cache undefs
7287
7288         # Cache the result before returning it.
7289         $alias_to_property_of{$name} = $result;
7290         return $result;
7291     }
7292
7293
7294     main::setup_package();
7295
7296     my %map;
7297     # A pointer to the map table object for this property
7298     main::set_access('map', \%map);
7299
7300     my %full_name;
7301     # The property's full name.  This is a duplicate of the copy kept in the
7302     # map table, but is needed because stringify needs it during
7303     # construction of the map table, and then would have a chicken before egg
7304     # problem.
7305     main::set_access('full_name', \%full_name, 'r');
7306
7307     my %table_ref;
7308     # This hash will contain as keys, all the aliases of any match tables
7309     # attached to this property, and as values, the pointers to their
7310     # respective tables.  This allows quick look-up of a table from any of its
7311     # names.
7312     main::set_access('table_ref', \%table_ref);
7313
7314     my %type;
7315     # The type of the property, $ENUM, $BINARY, etc
7316     main::set_access('type', \%type, 'r');
7317
7318     my %file;
7319     # The filename where the map table will go (if actually written).
7320     # Normally defaulted, but can be overridden.
7321     main::set_access('file', \%file, 'r', 's');
7322
7323     my %directory;
7324     # The directory where the map table will go (if actually written).
7325     # Normally defaulted, but can be overridden.
7326     main::set_access('directory', \%directory, 's');
7327
7328     my %pseudo_map_type;
7329     # This is used to affect the calculation of the map types for all the
7330     # ranges in the table.  It should be set to one of the values that signify
7331     # to alter the calculation.
7332     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7333
7334     my %has_only_code_point_maps;
7335     # A boolean used to help in computing the type of data in the map table.
7336     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7337
7338     my %unique_maps;
7339     # A list of the first few distinct mappings this property has.  This is
7340     # used to disambiguate between binary and enum property types, so don't
7341     # have to keep more than three.
7342     main::set_access('unique_maps', \%unique_maps);
7343
7344     my %pre_declared_maps;
7345     # A boolean that gives whether the input data should declare all the
7346     # tables used, or not.  If the former, unknown ones raise a warning.
7347     main::set_access('pre_declared_maps',
7348                                     \%pre_declared_maps, 'r');
7349
7350     sub new {
7351         # The only required parameter is the positionally first, name.  All
7352         # other parameters are key => value pairs.  See the documentation just
7353         # above for the meanings of the ones not passed directly on to the map
7354         # table constructor.
7355
7356         my $class = shift;
7357         my $name = shift || "";
7358
7359         my $self = property_ref($name);
7360         if (defined $self) {
7361             my $options_string = join ", ", @_;
7362             $options_string = ".  Ignoring options $options_string" if $options_string;
7363             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
7364             return $self;
7365         }
7366
7367         my %args = @_;
7368
7369         $self = bless \do { my $anonymous_scalar }, $class;
7370         my $addr = do { no overloading; pack 'J', $self; };
7371
7372         $directory{$addr} = delete $args{'Directory'};
7373         $file{$addr} = delete $args{'File'};
7374         $full_name{$addr} = delete $args{'Full_Name'} || $name;
7375         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7376         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7377         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7378                                     # Starting in this release, property
7379                                     # values should be defined for all
7380                                     # properties, except those overriding this
7381                                     // $v_version ge v5.1.0;
7382
7383         # Rest of parameters passed on.
7384
7385         $has_only_code_point_maps{$addr} = 1;
7386         $table_ref{$addr} = { };
7387         $unique_maps{$addr} = { };
7388
7389         $map{$addr} = Map_Table->new($name,
7390                                     Full_Name => $full_name{$addr},
7391                                     _Alias_Hash => \%alias_to_property_of,
7392                                     _Property => $self,
7393                                     %args);
7394         return $self;
7395     }
7396
7397     # See this program's beginning comment block about overloading the copy
7398     # constructor.  Few operations are defined on properties, but a couple are
7399     # useful.  It is safe to take the inverse of a property, and to remove a
7400     # single code point from it.
7401     use overload
7402         fallback => 0,
7403         qw("") => "_operator_stringify",
7404         "." => \&main::_operator_dot,
7405         '==' => \&main::_operator_equal,
7406         '!=' => \&main::_operator_not_equal,
7407         '=' => sub { return shift },
7408         '-=' => "_minus_and_equal",
7409     ;
7410
7411     sub _operator_stringify {
7412         return "Property '" .  shift->full_name . "'";
7413     }
7414
7415     sub _minus_and_equal {
7416         # Remove a single code point from the map table of a property.
7417
7418         my $self = shift;
7419         my $other = shift;
7420         my $reversed = shift;
7421         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7422
7423         if (ref $other) {
7424             Carp::my_carp_bug("Can't cope with a "
7425                         . ref($other)
7426                         . " argument to '-='.  Subtraction ignored.");
7427             return $self;
7428         }
7429         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
7430             Carp::my_carp_bug("Can't cope with a "
7431             .  __PACKAGE__
7432             . " being the first parameter in a '-='.  Subtraction ignored.");
7433             return $self;
7434         }
7435         else {
7436             no overloading;
7437             $map{pack 'J', $self}->delete_range($other, $other);
7438         }
7439         return $self;
7440     }
7441
7442     sub add_match_table {
7443         # Add a new match table for this property, with name given by the
7444         # parameter.  It returns a pointer to the table.
7445
7446         my $self = shift;
7447         my $name = shift;
7448         my %args = @_;
7449
7450         my $addr = do { no overloading; pack 'J', $self; };
7451
7452         my $table = $table_ref{$addr}{$name};
7453         my $standard_name = main::standardize($name);
7454         if (defined $table
7455             || (defined ($table = $table_ref{$addr}{$standard_name})))
7456         {
7457             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
7458             $table_ref{$addr}{$name} = $table;
7459             return $table;
7460         }
7461         else {
7462
7463             # See if this is a perl extension, if not passed in.
7464             my $perl_extension = delete $args{'Perl_Extension'};
7465             $perl_extension
7466                         = $self->perl_extension if ! defined $perl_extension;
7467
7468             $table = Match_Table->new(
7469                                 Name => $name,
7470                                 Perl_Extension => $perl_extension,
7471                                 _Alias_Hash => $table_ref{$addr},
7472                                 _Property => $self,
7473
7474                                 # gets property's status by default
7475                                 Status => $self->status,
7476                                 _Status_Info => $self->status_info,
7477                                 %args,
7478                                 Internal_Only_Warning => 1); # Override any
7479                                                              # input param
7480             return unless defined $table;
7481         }
7482
7483         # Save the names for quick look up
7484         $table_ref{$addr}{$standard_name} = $table;
7485         $table_ref{$addr}{$name} = $table;
7486
7487         # Perhaps we can figure out the type of this property based on the
7488         # fact of adding this match table.  First, string properties don't
7489         # have match tables; second, a binary property can't have 3 match
7490         # tables
7491         if ($type{$addr} == $UNKNOWN) {
7492             $type{$addr} = $NON_STRING;
7493         }
7494         elsif ($type{$addr} == $STRING) {
7495             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
7496             $type{$addr} = $NON_STRING;
7497         }
7498         elsif ($type{$addr} != $ENUM) {
7499             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7500                 && $type{$addr} == $BINARY)
7501             {
7502                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
7503                 $type{$addr} = $ENUM;
7504             }
7505         }
7506
7507         return $table;
7508     }
7509
7510     sub delete_match_table {
7511         # Delete the table referred to by $2 from the property $1.
7512
7513         my $self = shift;
7514         my $table_to_remove = shift;
7515         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7516
7517         my $addr = do { no overloading; pack 'J', $self; };
7518
7519         # Remove all names that refer to it.
7520         foreach my $key (keys %{$table_ref{$addr}}) {
7521             delete $table_ref{$addr}{$key}
7522                                 if $table_ref{$addr}{$key} == $table_to_remove;
7523         }
7524
7525         $table_to_remove->DESTROY;
7526         return;
7527     }
7528
7529     sub table {
7530         # Return a pointer to the match table (with name given by the
7531         # parameter) associated with this property; undef if none.
7532
7533         my $self = shift;
7534         my $name = shift;
7535         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7536
7537         my $addr = do { no overloading; pack 'J', $self; };
7538
7539         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7540
7541         # If quick look-up failed, try again using the standard form of the
7542         # input name.  If that succeeds, cache the result before returning so
7543         # won't have to standardize this input name again.
7544         my $standard_name = main::standardize($name);
7545         return unless defined $table_ref{$addr}{$standard_name};
7546
7547         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7548         return $table_ref{$addr}{$name};
7549     }
7550
7551     sub tables {
7552         # Return a list of pointers to all the match tables attached to this
7553         # property
7554
7555         no overloading;
7556         return main::uniques(values %{$table_ref{pack 'J', shift}});
7557     }
7558
7559     sub directory {
7560         # Returns the directory the map table for this property should be
7561         # output in.  If a specific directory has been specified, that has
7562         # priority;  'undef' is returned if the type isn't defined;
7563         # or $map_directory for everything else.
7564
7565         my $addr = do { no overloading; pack 'J', shift; };
7566
7567         return $directory{$addr} if defined $directory{$addr};
7568         return undef if $type{$addr} == $UNKNOWN;
7569         return $map_directory;
7570     }
7571
7572     sub swash_name {
7573         # Return the name that is used to both:
7574         #   1)  Name the file that the map table is written to.
7575         #   2)  The name of swash related stuff inside that file.
7576         # The reason for this is that the Perl core historically has used
7577         # certain names that aren't the same as the Unicode property names.
7578         # To continue using these, $file is hard-coded in this file for those,
7579         # but otherwise the standard name is used.  This is different from the
7580         # external_name, so that the rest of the files, like in lib can use
7581         # the standard name always, without regard to historical precedent.
7582
7583         my $self = shift;
7584         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7585
7586         my $addr = do { no overloading; pack 'J', $self; };
7587
7588         return $file{$addr} if defined $file{$addr};
7589         return $map{$addr}->external_name;
7590     }
7591
7592     sub to_create_match_tables {
7593         # Returns a boolean as to whether or not match tables should be
7594         # created for this property.
7595
7596         my $self = shift;
7597         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7598
7599         # The whole point of this pseudo property is match tables.
7600         return 1 if $self == $perl;
7601
7602         my $addr = do { no overloading; pack 'J', $self; };
7603
7604         # Don't generate tables of code points that match the property values
7605         # of a string property.  Such a list would most likely have many
7606         # property values, each with just one or very few code points mapping
7607         # to it.
7608         return 0 if $type{$addr} == $STRING;
7609
7610         # Don't generate anything for unimplemented properties.
7611         return 0 if grep { $self->complete_name eq $_ }
7612                                                     @unimplemented_properties;
7613         # Otherwise, do.
7614         return 1;
7615     }
7616
7617     sub property_add_or_replace_non_nulls {
7618         # This adds the mappings in the property $other to $self.  Non-null
7619         # mappings from $other override those in $self.  It essentially merges
7620         # the two properties, with the second having priority except for null
7621         # mappings.
7622
7623         my $self = shift;
7624         my $other = shift;
7625         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7626
7627         if (! $other->isa(__PACKAGE__)) {
7628             Carp::my_carp_bug("$other should be a "
7629                             . __PACKAGE__
7630                             . ".  Not a '"
7631                             . ref($other)
7632                             . "'.  Not added;");
7633             return;
7634         }
7635
7636         no overloading;
7637         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7638     }
7639
7640     sub set_type {
7641         # Set the type of the property.  Mostly this is figured out by the
7642         # data in the table.  But this is used to set it explicitly.  The
7643         # reason it is not a standard accessor is that when setting a binary
7644         # property, we need to make sure that all the true/false aliases are
7645         # present, as they were omitted in early Unicode releases.
7646
7647         my $self = shift;
7648         my $type = shift;
7649         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7650
7651         if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7652             Carp::my_carp("Unrecognized type '$type'.  Type not set");
7653             return;
7654         }
7655
7656         { no overloading; $type{pack 'J', $self} = $type; }
7657         return if $type != $BINARY;
7658
7659         my $yes = $self->table('Y');
7660         $yes = $self->table('Yes') if ! defined $yes;
7661         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7662                                                             if ! defined $yes;
7663
7664         # Add aliases in order wanted, duplicates will be ignored.  Note, that
7665         # could run into problems in outputting things in that we don't
7666         # distinguish between the name and full name of these.  Hopefully, if
7667         # the table was already created before this code is executed, it was
7668         # done with these set properly.
7669         $yes->add_alias('Y');
7670         $yes->add_alias('Yes');
7671         $yes->add_alias('T');
7672         $yes->add_alias('True');
7673
7674         my $no = $self->table('N');
7675         $no = $self->table('No') if ! defined $no;
7676         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
7677         $no->add_alias('N');
7678         $no->add_alias('No');
7679         $no->add_alias('F');
7680         $no->add_alias('False');
7681
7682         return;
7683     }
7684
7685     sub add_map {
7686         # Add a map to the property's map table.  This also keeps
7687         # track of the maps so that the property type can be determined from
7688         # its data.
7689
7690         my $self = shift;
7691         my $start = shift;  # First code point in range
7692         my $end = shift;    # Final code point in range
7693         my $map = shift;    # What the range maps to.
7694         # Rest of parameters passed on.
7695
7696         my $addr = do { no overloading; pack 'J', $self; };
7697
7698         # If haven't the type of the property, gather information to figure it
7699         # out.
7700         if ($type{$addr} == $UNKNOWN) {
7701
7702             # If the map contains an interior blank or dash, or most other
7703             # nonword characters, it will be a string property.  This
7704             # heuristic may actually miss some string properties.  If so, they
7705             # may need to have explicit set_types called for them.  This
7706             # happens in the Unihan properties.
7707             if ($map =~ / (?<= . ) [ -] (?= . ) /x
7708                 || $map =~ / [^\w.\/\ -]  /x)
7709             {
7710                 $self->set_type($STRING);
7711
7712                 # $unique_maps is used for disambiguating between ENUM and
7713                 # BINARY later; since we know the property is not going to be
7714                 # one of those, no point in keeping the data around
7715                 undef $unique_maps{$addr};
7716             }
7717             else {
7718
7719                 # Not necessarily a string.  The final decision has to be
7720                 # deferred until all the data are in.  We keep track of if all
7721                 # the values are code points for that eventual decision.
7722                 $has_only_code_point_maps{$addr} &=
7723                                             $map =~ / ^ $code_point_re $/x;
7724
7725                 # For the purposes of disambiguating between binary and other
7726                 # enumerations at the end, we keep track of the first three
7727                 # distinct property values.  Once we get to three, we know
7728                 # it's not going to be binary, so no need to track more.
7729                 if (scalar keys %{$unique_maps{$addr}} < 3) {
7730                     $unique_maps{$addr}{main::standardize($map)} = 1;
7731                 }
7732             }
7733         }
7734
7735         # Add the mapping by calling our map table's method
7736         return $map{$addr}->add_map($start, $end, $map, @_);
7737     }
7738
7739     sub compute_type {
7740         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
7741         # should be called after the property is mostly filled with its maps.
7742         # We have been keeping track of what the property values have been,
7743         # and now have the necessary information to figure out the type.
7744
7745         my $self = shift;
7746         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7747
7748         my $addr = do { no overloading; pack 'J', $self; };
7749
7750         my $type = $type{$addr};
7751
7752         # If already have figured these out, no need to do so again, but we do
7753         # a double check on ENUMS to make sure that a string property hasn't
7754         # improperly been classified as an ENUM, so continue on with those.
7755         return if $type == $STRING || $type == $BINARY;
7756
7757         # If every map is to a code point, is a string property.
7758         if ($type == $UNKNOWN
7759             && ($has_only_code_point_maps{$addr}
7760                 || (defined $map{$addr}->default_map
7761                     && $map{$addr}->default_map eq "")))
7762         {
7763             $self->set_type($STRING);
7764         }
7765         else {
7766
7767             # Otherwise, it is to some sort of enumeration.  (The case where
7768             # it is a Unicode miscellaneous property, and treated like a
7769             # string in this program is handled in add_map()).  Distinguish
7770             # between binary and some other enumeration type.  Of course, if
7771             # there are more than two values, it's not binary.  But more
7772             # subtle is the test that the default mapping is defined means it
7773             # isn't binary.  This in fact may change in the future if Unicode
7774             # changes the way its data is structured.  But so far, no binary
7775             # properties ever have @missing lines for them, so the default map
7776             # isn't defined for them.  The few properties that are two-valued
7777             # and aren't considered binary have the default map defined
7778             # starting in Unicode 5.0, when the @missing lines appeared; and
7779             # this program has special code to put in a default map for them
7780             # for earlier than 5.0 releases.
7781             if ($type == $ENUM
7782                 || scalar keys %{$unique_maps{$addr}} > 2
7783                 || defined $self->default_map)
7784             {
7785                 my $tables = $self->tables;
7786                 my $count = $self->count;
7787                 if ($verbosity && $count > 500 && $tables/$count > .1) {
7788                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7789                 }
7790                 $self->set_type($ENUM);
7791             }
7792             else {
7793                 $self->set_type($BINARY);
7794             }
7795         }
7796         undef $unique_maps{$addr};  # Garbage collect
7797         return;
7798     }
7799
7800     # Most of the accessors for a property actually apply to its map table.
7801     # Setup up accessor functions for those, referring to %map
7802     for my $sub (qw(
7803                     add_alias
7804                     add_anomalous_entry
7805                     add_comment
7806                     add_conflicting
7807                     add_description
7808                     add_duplicate
7809                     add_note
7810                     aliases
7811                     comment
7812                     complete_name
7813                     containing_range
7814                     core_access
7815                     count
7816                     default_map
7817                     delete_range
7818                     description
7819                     each_range
7820                     external_name
7821                     file_path
7822                     format
7823                     initialize
7824                     inverse_list
7825                     is_empty
7826                     name
7827                     note
7828                     perl_extension
7829                     property
7830                     range_count
7831                     ranges
7832                     range_size_1
7833                     reset_each_range
7834                     set_comment
7835                     set_core_access
7836                     set_default_map
7837                     set_file_path
7838                     set_final_comment
7839                     set_range_size_1
7840                     set_status
7841                     set_to_output_map
7842                     short_name
7843                     status
7844                     status_info
7845                     to_output_map
7846                     type_of
7847                     value_of
7848                     write
7849                 ))
7850                     # 'property' above is for symmetry, so that one can take
7851                     # the property of a property and get itself, and so don't
7852                     # have to distinguish between properties and tables in
7853                     # calling code
7854     {
7855         no strict "refs";
7856         *$sub = sub {
7857             use strict "refs";
7858             my $self = shift;
7859             no overloading;
7860             return $map{pack 'J', $self}->$sub(@_);
7861         }
7862     }
7863
7864
7865 } # End closure
7866
7867 package main;
7868
7869 sub join_lines($) {
7870     # Returns lines of the input joined together, so that they can be folded
7871     # properly.
7872     # This causes continuation lines to be joined together into one long line
7873     # for folding.  A continuation line is any line that doesn't begin with a
7874     # space or "\b" (the latter is stripped from the output).  This is so
7875     # lines can be be in a HERE document so as to fit nicely in the terminal
7876     # width, but be joined together in one long line, and then folded with
7877     # indents, '#' prefixes, etc, properly handled.
7878     # A blank separates the joined lines except if there is a break; an extra
7879     # blank is inserted after a period ending a line.
7880
7881     # Initialize the return with the first line.
7882     my ($return, @lines) = split "\n", shift;
7883
7884     # If the first line is null, it was an empty line, add the \n back in
7885     $return = "\n" if $return eq "";
7886
7887     # Now join the remainder of the physical lines.
7888     for my $line (@lines) {
7889
7890         # An empty line means wanted a blank line, so add two \n's to get that
7891         # effect, and go to the next line.
7892         if (length $line == 0) {
7893             $return .= "\n\n";
7894             next;
7895         }
7896
7897         # Look at the last character of what we have so far.
7898         my $previous_char = substr($return, -1, 1);
7899
7900         # And at the next char to be output.
7901         my $next_char = substr($line, 0, 1);
7902
7903         if ($previous_char ne "\n") {
7904
7905             # Here didn't end wth a nl.  If the next char a blank or \b, it
7906             # means that here there is a break anyway.  So add a nl to the
7907             # output.
7908             if ($next_char eq " " || $next_char eq "\b") {
7909                 $previous_char = "\n";
7910                 $return .= $previous_char;
7911             }
7912
7913             # Add an extra space after periods.
7914             $return .= " " if $previous_char eq '.';
7915         }
7916
7917         # Here $previous_char is still the latest character to be output.  If
7918         # it isn't a nl, it means that the next line is to be a continuation
7919         # line, with a blank inserted between them.
7920         $return .= " " if $previous_char ne "\n";
7921
7922         # Get rid of any \b
7923         substr($line, 0, 1) = "" if $next_char eq "\b";
7924
7925         # And append this next line.
7926         $return .= $line;
7927     }
7928
7929     return $return;
7930 }
7931
7932 sub simple_fold($;$$$) {
7933     # Returns a string of the input (string or an array of strings) folded
7934     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7935     # a \n
7936     # This is tailored for the kind of text written by this program,
7937     # especially the pod file, which can have very long names with
7938     # underscores in the middle, or words like AbcDefgHij....  We allow
7939     # breaking in the middle of such constructs if the line won't fit
7940     # otherwise.  The break in such cases will come either just after an
7941     # underscore, or just before one of the Capital letters.
7942
7943     local $to_trace = 0 if main::DEBUG;
7944
7945     my $line = shift;
7946     my $prefix = shift;     # Optional string to prepend to each output
7947                             # line
7948     $prefix = "" unless defined $prefix;
7949
7950     my $hanging_indent = shift; # Optional number of spaces to indent
7951                                 # continuation lines
7952     $hanging_indent = 0 unless $hanging_indent;
7953
7954     my $right_margin = shift;   # Optional number of spaces to narrow the
7955                                 # total width by.
7956     $right_margin = 0 unless defined $right_margin;
7957
7958     # Call carp with the 'nofold' option to avoid it from trying to call us
7959     # recursively
7960     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7961
7962     # The space available doesn't include what's automatically prepended
7963     # to each line, or what's reserved on the right.
7964     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7965     # XXX Instead of using the 'nofold' perhaps better to look up the stack
7966
7967     if (DEBUG && $hanging_indent >= $max) {
7968         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
7969         $hanging_indent = 0;
7970     }
7971
7972     # First, split into the current physical lines.
7973     my @line;
7974     if (ref $line) {        # Better be an array, because not bothering to
7975                             # test
7976         foreach my $line (@{$line}) {
7977             push @line, split /\n/, $line;
7978         }
7979     }
7980     else {
7981         @line = split /\n/, $line;
7982     }
7983
7984     #local $to_trace = 1 if main::DEBUG;
7985     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7986
7987     # Look at each current physical line.
7988     for (my $i = 0; $i < @line; $i++) {
7989         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7990         #local $to_trace = 1 if main::DEBUG;
7991         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7992
7993         # Remove prefix, because will be added back anyway, don't want
7994         # doubled prefix
7995         $line[$i] =~ s/^$prefix//;
7996
7997         # Remove trailing space
7998         $line[$i] =~ s/\s+\Z//;
7999
8000         # If the line is too long, fold it.
8001         if (length $line[$i] > $max) {
8002             my $remainder;
8003
8004             # Here needs to fold.  Save the leading space in the line for
8005             # later.
8006             $line[$i] =~ /^ ( \s* )/x;
8007             my $leading_space = $1;
8008             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8009
8010             # If character at final permissible position is white space,
8011             # fold there, which will delete that white space
8012             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8013                 $remainder = substr($line[$i], $max);
8014                 $line[$i] = substr($line[$i], 0, $max - 1);
8015             }
8016             else {
8017
8018                 # Otherwise fold at an acceptable break char closest to
8019                 # the max length.  Look at just the maximal initial
8020                 # segment of the line
8021                 my $segment = substr($line[$i], 0, $max - 1);
8022                 if ($segment =~
8023                     /^ ( .{$hanging_indent}   # Don't look before the
8024                                               #  indent.
8025                         \ *                   # Don't look in leading
8026                                               #  blanks past the indent
8027                             [^ ] .*           # Find the right-most
8028                         (?:                   #  acceptable break:
8029                             [ \s = ]          # space or equal
8030                             | - (?! [.0-9] )  # or non-unary minus.
8031                         )                     # $1 includes the character
8032                     )/x)
8033                 {
8034                     # Split into the initial part that fits, and remaining
8035                     # part of the input
8036                     $remainder = substr($line[$i], length $1);
8037                     $line[$i] = $1;
8038                     trace $line[$i] if DEBUG && $to_trace;
8039                     trace $remainder if DEBUG && $to_trace;
8040                 }
8041
8042                 # If didn't find a good breaking spot, see if there is a
8043                 # not-so-good breaking spot.  These are just after
8044                 # underscores or where the case changes from lower to
8045                 # upper.  Use \a as a soft hyphen, but give up
8046                 # and don't break the line if there is actually a \a
8047                 # already in the input.  We use an ascii character for the
8048                 # soft-hyphen to avoid any attempt by miniperl to try to
8049                 # access the files that this program is creating.
8050                 elsif ($segment !~ /\a/
8051                        && ($segment =~ s/_/_\a/g
8052                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8053                 {
8054                     # Here were able to find at least one place to insert
8055                     # our substitute soft hyphen.  Find the right-most one
8056                     # and replace it by a real hyphen.
8057                     trace $segment if DEBUG && $to_trace;
8058                     substr($segment,
8059                             rindex($segment, "\a"),
8060                             1) = '-';
8061
8062                     # Then remove the soft hyphen substitutes.
8063                     $segment =~ s/\a//g;
8064                     trace $segment if DEBUG && $to_trace;
8065
8066                     # And split into the initial part that fits, and
8067                     # remainder of the line
8068                     my $pos = rindex($segment, '-');
8069                     $remainder = substr($line[$i], $pos);
8070                     trace $remainder if DEBUG && $to_trace;
8071                     $line[$i] = substr($segment, 0, $pos + 1);
8072                 }
8073             }
8074
8075             # Here we know if we can fold or not.  If we can, $remainder
8076             # is what remains to be processed in the next iteration.
8077             if (defined $remainder) {
8078                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8079
8080                 # Insert the folded remainder of the line as a new element
8081                 # of the array.  (It may still be too long, but we will
8082                 # deal with that next time through the loop.)  Omit any
8083                 # leading space in the remainder.
8084                 $remainder =~ s/^\s+//;
8085                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8086
8087                 # But then indent by whichever is larger of:
8088                 # 1) the leading space on the input line;
8089                 # 2) the hanging indent.
8090                 # This preserves indentation in the original line.
8091                 my $lead = ($leading_space)
8092                             ? length $leading_space
8093                             : $hanging_indent;
8094                 $lead = max($lead, $hanging_indent);
8095                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8096             }
8097         }
8098
8099         # Ready to output the line. Get rid of any trailing space
8100         # And prefix by the required $prefix passed in.
8101         $line[$i] =~ s/\s+$//;
8102         $line[$i] = "$prefix$line[$i]\n";
8103     } # End of looping through all the lines.
8104
8105     return join "", @line;
8106 }
8107
8108 sub property_ref {  # Returns a reference to a property object.
8109     return Property::property_ref(@_);
8110 }
8111
8112 sub force_unlink ($) {
8113     my $filename = shift;
8114     return unless file_exists($filename);
8115     return if CORE::unlink($filename);
8116
8117     # We might need write permission
8118     chmod 0777, $filename;
8119     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
8120     return;
8121 }
8122
8123 sub write ($$@) {
8124     # Given a filename and references to arrays of lines, write the lines of
8125     # each array to the file
8126     # Filename can be given as an arrayref of directory names
8127
8128     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8129
8130     my $file  = shift;
8131     my $use_utf8 = shift;
8132
8133     # Get into a single string if an array, and get rid of, in Unix terms, any
8134     # leading '.'
8135     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8136     $file = File::Spec->canonpath($file);
8137
8138     # If has directories, make sure that they all exist
8139     (undef, my $directories, undef) = File::Spec->splitpath($file);
8140     File::Path::mkpath($directories) if $directories && ! -d $directories;
8141
8142     push @files_actually_output, $file;
8143
8144     force_unlink ($file);
8145
8146     my $OUT;
8147     if (not open $OUT, ">", $file) {
8148         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
8149         return;
8150     }
8151
8152     binmode $OUT, ":utf8" if $use_utf8;
8153
8154     while (defined (my $lines_ref = shift)) {
8155         unless (@$lines_ref) {
8156             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8157         }
8158
8159         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8160     }
8161     close $OUT or die Carp::my_carp("close '$file' failed: $!");
8162
8163     print "$file written.\n" if $verbosity >= $VERBOSE;
8164
8165     return;
8166 }
8167
8168
8169 sub Standardize($) {
8170     # This converts the input name string into a standardized equivalent to
8171     # use internally.
8172
8173     my $name = shift;
8174     unless (defined $name) {
8175       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
8176       return;
8177     }
8178
8179     # Remove any leading or trailing white space
8180     $name =~ s/^\s+//g;
8181     $name =~ s/\s+$//g;
8182
8183     # Convert interior white space and hyphens into underscores.
8184     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8185
8186     # Capitalize the letter following an underscore, and convert a sequence of
8187     # multiple underscores to a single one
8188     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8189
8190     # And capitalize the first letter, but not for the special cjk ones.
8191     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8192     return $name;
8193 }
8194
8195 sub standardize ($) {
8196     # Returns a lower-cased standardized name, without underscores.  This form
8197     # is chosen so that it can distinguish between any real versus superficial
8198     # Unicode name differences.  It relies on the fact that Unicode doesn't
8199     # have interior underscores, white space, nor dashes in any
8200     # stricter-matched name.  It should not be used on Unicode code point
8201     # names (the Name property), as they mostly, but not always follow these
8202     # rules.
8203
8204     my $name = Standardize(shift);
8205     return if !defined $name;
8206
8207     $name =~ s/ (?<= .) _ (?= . ) //xg;
8208     return lc $name;
8209 }
8210
8211 sub utf8_heavy_name ($$) {
8212     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
8213     # perhaps this function should be placed somewhere, like Heavy.pl so that
8214     # utf8_heavy can use it directly without duplicating code that can get
8215     # out-of sync.
8216
8217     my $table = shift;
8218     my $alias = shift;
8219     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8220
8221     my $property = $table->property;
8222     $property = ($property == $perl)
8223                 ? ""                # 'perl' is never explicitly stated
8224                 : standardize($property->name) . '=';
8225     if ($alias->loose_match) {
8226         return $property . standardize($alias->name);
8227     }
8228     else {
8229         return lc ($property . $alias->name);
8230     }
8231
8232     return;
8233 }
8234
8235 {   # Closure
8236
8237     my $indent_increment = " " x 2;
8238     my %already_output;
8239
8240     $main::simple_dumper_nesting = 0;
8241
8242     sub simple_dumper {
8243         # Like Simple Data::Dumper. Good enough for our needs. We can't use
8244         # the real thing as we have to run under miniperl.
8245
8246         # It is designed so that on input it is at the beginning of a line,
8247         # and the final thing output in any call is a trailing ",\n".
8248
8249         my $item = shift;
8250         my $indent = shift;
8251         $indent = "" if ! defined $indent;
8252
8253         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8254
8255         # nesting level is localized, so that as the call stack pops, it goes
8256         # back to the prior value.
8257         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8258         undef %already_output if $main::simple_dumper_nesting == 0;
8259         $main::simple_dumper_nesting++;
8260         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8261
8262         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8263
8264         # Determine the indent for recursive calls.
8265         my $next_indent = $indent . $indent_increment;
8266
8267         my $output;
8268         if (! ref $item) {
8269
8270             # Dump of scalar: just output it in quotes if not a number.  To do
8271             # so we must escape certain characters, and therefore need to
8272             # operate on a copy to avoid changing the original
8273             my $copy = $item;
8274             $copy = $UNDEF unless defined $copy;
8275
8276             # Quote non-numbers (numbers also have optional leading '-' and
8277             # fractions)
8278             if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
8279
8280                 # Escape apostrophe and backslash
8281                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8282                 $copy = "'$copy'";
8283             }
8284             $output = "$indent$copy,\n";
8285         }
8286         else {
8287
8288             # Keep track of cycles in the input, and refuse to infinitely loop
8289             my $addr = do { no overloading; pack 'J', $item; };
8290             if (defined $already_output{$addr}) {
8291                 return "${indent}ALREADY OUTPUT: $item\n";
8292             }
8293             $already_output{$addr} = $item;
8294
8295             if (ref $item eq 'ARRAY') {
8296                 my $using_brackets;
8297                 $output = $indent;
8298                 if ($main::simple_dumper_nesting > 1) {
8299                     $output .= '[';
8300                     $using_brackets = 1;
8301                 }
8302                 else {
8303                     $using_brackets = 0;
8304                 }
8305
8306                 # If the array is empty, put the closing bracket on the same
8307                 # line.  Otherwise, recursively add each array element
8308                 if (@$item == 0) {
8309                     $output .= " ";
8310                 }
8311                 else {
8312                     $output .= "\n";
8313                     for (my $i = 0; $i < @$item; $i++) {
8314
8315                         # Indent array elements one level
8316                         $output .= &simple_dumper($item->[$i], $next_indent);
8317                         $output =~ s/\n$//;      # Remove any trailing nl so
8318                         $output .= " # [$i]\n";  # as to add a comment giving
8319                                                  # the array index
8320                     }
8321                     $output .= $indent;     # Indent closing ']' to orig level
8322                 }
8323                 $output .= ']' if $using_brackets;
8324                 $output .= ",\n";
8325             }
8326             elsif (ref $item eq 'HASH') {
8327                 my $is_first_line;
8328                 my $using_braces;
8329                 my $body_indent;
8330
8331                 # No surrounding braces at top level
8332                 $output .= $indent;
8333                 if ($main::simple_dumper_nesting > 1) {
8334                     $output .= "{\n";
8335                     $is_first_line = 0;
8336                     $body_indent = $next_indent;
8337                     $next_indent .= $indent_increment;
8338                     $using_braces = 1;
8339                 }
8340                 else {
8341                     $is_first_line = 1;
8342                     $body_indent = $indent;
8343                     $using_braces = 0;
8344                 }
8345
8346                 # Output hashes sorted alphabetically instead of apparently
8347                 # random.  Use caseless alphabetic sort
8348                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8349                 {
8350                     if ($is_first_line) {
8351                         $is_first_line = 0;
8352                     }
8353                     else {
8354                         $output .= "$body_indent";
8355                     }
8356
8357                     # The key must be a scalar, but this recursive call quotes
8358                     # it
8359                     $output .= &simple_dumper($key);
8360
8361                     # And change the trailing comma and nl to the hash fat
8362                     # comma for clarity, and so the value can be on the same
8363                     # line
8364                     $output =~ s/,\n$/ => /;
8365
8366                     # Recursively call to get the value's dump.
8367                     my $next = &simple_dumper($item->{$key}, $next_indent);
8368
8369                     # If the value is all on one line, remove its indent, so
8370                     # will follow the => immediately.  If it takes more than
8371                     # one line, start it on a new line.
8372                     if ($next !~ /\n.*\n/) {
8373                         $next =~ s/^ *//;
8374                     }
8375                     else {
8376                         $output .= "\n";
8377                     }
8378                     $output .= $next;
8379                 }
8380
8381                 $output .= "$indent},\n" if $using_braces;
8382             }
8383             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8384                 $output = $indent . ref($item) . "\n";
8385                 # XXX see if blessed
8386             }
8387             elsif ($item->can('dump')) {
8388
8389                 # By convention in this program, objects furnish a 'dump'
8390                 # method.  Since not doing any output at this level, just pass
8391                 # on the input indent
8392                 $output = $item->dump($indent);
8393             }
8394             else {
8395                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
8396             }
8397         }
8398         return $output;
8399     }
8400 }
8401
8402 sub dump_inside_out {
8403     # Dump inside-out hashes in an object's state by converting them to a
8404     # regular hash and then calling simple_dumper on that.
8405
8406     my $object = shift;
8407     my $fields_ref = shift;
8408     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8409
8410     my $addr = do { no overloading; pack 'J', $object; };
8411
8412     my %hash;
8413     foreach my $key (keys %$fields_ref) {
8414         $hash{$key} = $fields_ref->{$key}{$addr};
8415     }
8416
8417     return simple_dumper(\%hash, @_);
8418 }
8419
8420 sub _operator_dot {
8421     # Overloaded '.' method that is common to all packages.  It uses the
8422     # package's stringify method.
8423
8424     my $self = shift;
8425     my $other = shift;
8426     my $reversed = shift;
8427     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8428
8429     $other = "" unless defined $other;
8430
8431     foreach my $which (\$self, \$other) {
8432         next unless ref $$which;
8433         if ($$which->can('_operator_stringify')) {
8434             $$which = $$which->_operator_stringify;
8435         }
8436         else {
8437             my $ref = ref $$which;
8438             my $addr = do { no overloading; pack 'J', $$which; };
8439             $$which = "$ref ($addr)";
8440         }
8441     }
8442     return ($reversed)
8443             ? "$other$self"
8444             : "$self$other";
8445 }
8446
8447 sub _operator_equal {
8448     # Generic overloaded '==' routine.  To be equal, they must be the exact
8449     # same object
8450
8451     my $self = shift;
8452     my $other = shift;
8453
8454     return 0 unless defined $other;
8455     return 0 unless ref $other;
8456     no overloading;
8457     return $self == $other;
8458 }
8459
8460 sub _operator_not_equal {
8461     my $self = shift;
8462     my $other = shift;
8463
8464     return ! _operator_equal($self, $other);
8465 }
8466
8467 sub process_PropertyAliases($) {
8468     # This reads in the PropertyAliases.txt file, which contains almost all
8469     # the character properties in Unicode and their equivalent aliases:
8470     # scf       ; Simple_Case_Folding         ; sfc
8471     #
8472     # Field 0 is the preferred short name for the property.
8473     # Field 1 is the full name.
8474     # Any succeeding ones are other accepted names.
8475
8476     my $file= shift;
8477     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8478
8479     # This whole file was non-existent in early releases, so use our own
8480     # internal one.
8481     $file->insert_lines(get_old_property_aliases())
8482                                                 if ! -e 'PropertyAliases.txt';
8483
8484     # Add any cjk properties that may have been defined.
8485     $file->insert_lines(@cjk_properties);
8486
8487     while ($file->next_line) {
8488
8489         my @data = split /\s*;\s*/;
8490
8491         my $full = $data[1];
8492
8493         my $this = Property->new($data[0], Full_Name => $full);
8494
8495         # Start looking for more aliases after these two.
8496         for my $i (2 .. @data - 1) {
8497             $this->add_alias($data[$i]);
8498         }
8499
8500     }
8501     return;
8502 }
8503
8504 sub finish_property_setup {
8505     # Finishes setting up after PropertyAliases.
8506
8507     my $file = shift;
8508     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8509
8510     # This entry was missing from this file in earlier Unicode versions
8511     if (-e 'Jamo.txt') {
8512         my $jsn = property_ref('JSN');
8513         if (! defined $jsn) {
8514             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8515         }
8516     }
8517
8518     # This entry is still missing as of 6.0, perhaps because no short name for
8519     # it.
8520     if (-e 'NameAliases.txt') {
8521         my $aliases = property_ref('Name_Alias');
8522         if (! defined $aliases) {
8523             $aliases = Property->new('Name_Alias');
8524         }
8525     }
8526
8527     # These are used so much, that we set globals for them.
8528     $gc = property_ref('General_Category');
8529     $block = property_ref('Block');
8530
8531     # Perl adds this alias.
8532     $gc->add_alias('Category');
8533
8534     # For backwards compatibility, these property files have particular names.
8535     my $upper = property_ref('Uppercase_Mapping');
8536     $upper->set_core_access('uc()');
8537     $upper->set_file('Upper'); # This is what utf8.c calls it
8538
8539     my $lower = property_ref('Lowercase_Mapping');
8540     $lower->set_core_access('lc()');
8541     $lower->set_file('Lower');
8542
8543     my $title = property_ref('Titlecase_Mapping');
8544     $title->set_core_access('ucfirst()');
8545     $title->set_file('Title');
8546
8547     my $fold = property_ref('Case_Folding');
8548     $fold->set_file('Fold') if defined $fold;
8549
8550     # Unicode::Normalize expects this file with this name and directory.
8551     my $ccc = property_ref('Canonical_Combining_Class');
8552     if (defined $ccc) {
8553         $ccc->set_file('CombiningClass');
8554         $ccc->set_directory(File::Spec->curdir());
8555     }
8556
8557     # utf8.c has a different meaning for non range-size-1 for map properties
8558     # that this program doesn't currently handle; and even if it were changed
8559     # to do so, some other code may be using them expecting range size 1.
8560     foreach my $property (qw {
8561                                 Case_Folding
8562                                 Lowercase_Mapping
8563                                 Titlecase_Mapping
8564                                 Uppercase_Mapping
8565                             })
8566     {
8567         property_ref($property)->set_range_size_1(1);
8568     }
8569
8570     # These two properties aren't actually used in the core, but unfortunately
8571     # the names just above that are in the core interfere with these, so
8572     # choose different names.  These aren't a problem unless the map tables
8573     # for these files get written out.
8574     my $lowercase = property_ref('Lowercase');
8575     $lowercase->set_file('IsLower') if defined $lowercase;
8576     my $uppercase = property_ref('Uppercase');
8577     $uppercase->set_file('IsUpper') if defined $uppercase;
8578
8579     # Set up the hard-coded default mappings, but only on properties defined
8580     # for this release
8581     foreach my $property (keys %default_mapping) {
8582         my $property_object = property_ref($property);
8583         next if ! defined $property_object;
8584         my $default_map = $default_mapping{$property};
8585         $property_object->set_default_map($default_map);
8586
8587         # A map of <code point> implies the property is string.
8588         if ($property_object->type == $UNKNOWN
8589             && $default_map eq $CODE_POINT)
8590         {
8591             $property_object->set_type($STRING);
8592         }
8593     }
8594
8595     # The following use the Multi_Default class to create objects for
8596     # defaults.
8597
8598     # Bidi class has a complicated default, but the derived file takes care of
8599     # the complications, leaving just 'L'.
8600     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8601         property_ref('Bidi_Class')->set_default_map('L');
8602     }
8603     else {
8604         my $default;
8605
8606         # The derived file was introduced in 3.1.1.  The values below are
8607         # taken from table 3-8, TUS 3.0
8608         my $default_R =
8609             'my $default = Range_List->new;
8610              $default->add_range(0x0590, 0x05FF);
8611              $default->add_range(0xFB1D, 0xFB4F);'
8612         ;
8613
8614         # The defaults apply only to unassigned characters
8615         $default_R .= '$gc->table("Unassigned") & $default;';
8616
8617         if ($v_version lt v3.0.0) {
8618             $default = Multi_Default->new(R => $default_R, 'L');
8619         }
8620         else {
8621
8622             # AL apparently not introduced until 3.0:  TUS 2.x references are
8623             # not on-line to check it out
8624             my $default_AL =
8625                 'my $default = Range_List->new;
8626                  $default->add_range(0x0600, 0x07BF);
8627                  $default->add_range(0xFB50, 0xFDFF);
8628                  $default->add_range(0xFE70, 0xFEFF);'
8629             ;
8630
8631             # Non-character code points introduced in this release; aren't AL
8632             if ($v_version ge 3.1.0) {
8633                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8634             }
8635             $default_AL .= '$gc->table("Unassigned") & $default';
8636             $default = Multi_Default->new(AL => $default_AL,
8637                                           R => $default_R,
8638                                           'L');
8639         }
8640         property_ref('Bidi_Class')->set_default_map($default);
8641     }
8642
8643     # Joining type has a complicated default, but the derived file takes care
8644     # of the complications, leaving just 'U' (or Non_Joining), except the file
8645     # is bad in 3.1.0
8646     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8647         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8648             property_ref('Joining_Type')->set_default_map('Non_Joining');
8649         }
8650         else {
8651
8652             # Otherwise, there are not one, but two possibilities for the
8653             # missing defaults: T and U.
8654             # The missing defaults that evaluate to T are given by:
8655             # T = Mn + Cf - ZWNJ - ZWJ
8656             # where Mn and Cf are the general category values. In other words,
8657             # any non-spacing mark or any format control character, except
8658             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8659             # WIDTH JOINER (joining type C).
8660             my $default = Multi_Default->new(
8661                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8662                'Non_Joining');
8663             property_ref('Joining_Type')->set_default_map($default);
8664         }
8665     }
8666
8667     # Line break has a complicated default in early releases. It is 'Unknown'
8668     # for non-assigned code points; 'AL' for assigned.
8669     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8670         my $lb = property_ref('Line_Break');
8671         if ($v_version gt 3.2.0) {
8672             $lb->set_default_map('Unknown');
8673         }
8674         else {
8675             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8676                                               'AL');
8677             $lb->set_default_map($default);
8678         }
8679
8680         # If has the URS property, make sure that the standard aliases are in
8681         # it, since not in the input tables in some versions.
8682         my $urs = property_ref('Unicode_Radical_Stroke');
8683         if (defined $urs) {
8684             $urs->add_alias('cjkRSUnicode');
8685             $urs->add_alias('kRSUnicode');
8686         }
8687     }
8688     return;
8689 }
8690
8691 sub get_old_property_aliases() {
8692     # Returns what would be in PropertyAliases.txt if it existed in very old
8693     # versions of Unicode.  It was derived from the one in 3.2, and pared
8694     # down based on the data that was actually in the older releases.
8695     # An attempt was made to use the existence of files to mean inclusion or
8696     # not of various aliases, but if this was not sufficient, using version
8697     # numbers was resorted to.
8698
8699     my @return;
8700
8701     # These are to be used in all versions (though some are constructed by
8702     # this program if missing)
8703     push @return, split /\n/, <<'END';
8704 bc        ; Bidi_Class
8705 Bidi_M    ; Bidi_Mirrored
8706 cf        ; Case_Folding
8707 ccc       ; Canonical_Combining_Class
8708 dm        ; Decomposition_Mapping
8709 dt        ; Decomposition_Type
8710 gc        ; General_Category
8711 isc       ; ISO_Comment
8712 lc        ; Lowercase_Mapping
8713 na        ; Name
8714 na1       ; Unicode_1_Name
8715 nt        ; Numeric_Type
8716 nv        ; Numeric_Value
8717 sfc       ; Simple_Case_Folding
8718 slc       ; Simple_Lowercase_Mapping
8719 stc       ; Simple_Titlecase_Mapping
8720 suc       ; Simple_Uppercase_Mapping
8721 tc        ; Titlecase_Mapping
8722 uc        ; Uppercase_Mapping
8723 END
8724
8725     if (-e 'Blocks.txt') {
8726         push @return, "blk       ; Block\n";
8727     }
8728     if (-e 'ArabicShaping.txt') {
8729         push @return, split /\n/, <<'END';
8730 jg        ; Joining_Group
8731 jt        ; Joining_Type
8732 END
8733     }
8734     if (-e 'PropList.txt') {
8735
8736         # This first set is in the original old-style proplist.
8737         push @return, split /\n/, <<'END';
8738 Alpha     ; Alphabetic
8739 Bidi_C    ; Bidi_Control
8740 Dash      ; Dash
8741 Dia       ; Diacritic
8742 Ext       ; Extender
8743 Hex       ; Hex_Digit
8744 Hyphen    ; Hyphen
8745 IDC       ; ID_Continue
8746 Ideo      ; Ideographic
8747 Join_C    ; Join_Control
8748 Math      ; Math
8749 QMark     ; Quotation_Mark
8750 Term      ; Terminal_Punctuation
8751 WSpace    ; White_Space
8752 END
8753         # The next sets were added later
8754         if ($v_version ge v3.0.0) {
8755             push @return, split /\n/, <<'END';
8756 Upper     ; Uppercase
8757 Lower     ; Lowercase
8758 END
8759         }
8760         if ($v_version ge v3.0.1) {
8761             push @return, split /\n/, <<'END';
8762 NChar     ; Noncharacter_Code_Point
8763 END
8764         }
8765         # The next sets were added in the new-style
8766         if ($v_version ge v3.1.0) {
8767             push @return, split /\n/, <<'END';
8768 OAlpha    ; Other_Alphabetic
8769 OLower    ; Other_Lowercase
8770 OMath     ; Other_Math
8771 OUpper    ; Other_Uppercase
8772 END
8773         }
8774         if ($v_version ge v3.1.1) {
8775             push @return, "AHex      ; ASCII_Hex_Digit\n";
8776         }
8777     }
8778     if (-e 'EastAsianWidth.txt') {
8779         push @return, "ea        ; East_Asian_Width\n";
8780     }
8781     if (-e 'CompositionExclusions.txt') {
8782         push @return, "CE        ; Composition_Exclusion\n";
8783     }
8784     if (-e 'LineBreak.txt') {
8785         push @return, "lb        ; Line_Break\n";
8786     }
8787     if (-e 'BidiMirroring.txt') {
8788         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
8789     }
8790     if (-e 'Scripts.txt') {
8791         push @return, "sc        ; Script\n";
8792     }
8793     if (-e 'DNormalizationProps.txt') {
8794         push @return, split /\n/, <<'END';
8795 Comp_Ex   ; Full_Composition_Exclusion
8796 FC_NFKC   ; FC_NFKC_Closure
8797 NFC_QC    ; NFC_Quick_Check
8798 NFD_QC    ; NFD_Quick_Check
8799 NFKC_QC   ; NFKC_Quick_Check
8800 NFKD_QC   ; NFKD_Quick_Check
8801 XO_NFC    ; Expands_On_NFC
8802 XO_NFD    ; Expands_On_NFD
8803 XO_NFKC   ; Expands_On_NFKC
8804 XO_NFKD   ; Expands_On_NFKD
8805 END
8806     }
8807     if (-e 'DCoreProperties.txt') {
8808         push @return, split /\n/, <<'END';
8809 IDS       ; ID_Start
8810 XIDC      ; XID_Continue
8811 XIDS      ; XID_Start
8812 END
8813         # These can also appear in some versions of PropList.txt
8814         push @return, "Lower     ; Lowercase\n"
8815                                     unless grep { $_ =~ /^Lower\b/} @return;
8816         push @return, "Upper     ; Uppercase\n"
8817                                     unless grep { $_ =~ /^Upper\b/} @return;
8818     }
8819
8820     # This flag requires the DAge.txt file to be copied into the directory.
8821     if (DEBUG && $compare_versions) {
8822         push @return, 'age       ; Age';
8823     }
8824
8825     return @return;
8826 }
8827
8828 sub process_PropValueAliases {
8829     # This file contains values that properties look like:
8830     # bc ; AL        ; Arabic_Letter
8831     # blk; n/a       ; Greek_And_Coptic                 ; Greek
8832     #
8833     # Field 0 is the property.
8834     # Field 1 is the short name of a property value or 'n/a' if no
8835     #                short name exists;
8836     # Field 2 is the full property value name;
8837     # Any other fields are more synonyms for the property value.
8838     # Purely numeric property values are omitted from the file; as are some
8839     # others, fewer and fewer in later releases
8840
8841     # Entries for the ccc property have an extra field before the
8842     # abbreviation:
8843     # ccc;   0; NR   ; Not_Reordered
8844     # It is the numeric value that the names are synonyms for.
8845
8846     # There are comment entries for values missing from this file:
8847     # # @missing: 0000..10FFFF; ISO_Comment; <none>
8848     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8849
8850     my $file= shift;
8851     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8852
8853     # This whole file was non-existent in early releases, so use our own
8854     # internal one if necessary.
8855     if (! -e 'PropValueAliases.txt') {
8856         $file->insert_lines(get_old_property_value_aliases());
8857     }
8858
8859     # Add any explicit cjk values
8860     $file->insert_lines(@cjk_property_values);
8861
8862     # This line is used only for testing the code that checks for name
8863     # conflicts.  There is a script Inherited, and when this line is executed
8864     # it causes there to be a name conflict with the 'Inherited' that this
8865     # program generates for this block property value
8866     #$file->insert_lines('blk; n/a; Herited');
8867
8868
8869     # Process each line of the file ...
8870     while ($file->next_line) {
8871
8872         my ($property, @data) = split /\s*;\s*/;
8873
8874         # The ccc property has an extra field at the beginning, which is the
8875         # numeric value.  Move it to be after the other two, mnemonic, fields,
8876         # so that those will be used as the property value's names, and the
8877         # number will be an extra alias.  (Rightmost splice removes field 1-2,
8878         # returning them in a slice; left splice inserts that before anything,
8879         # thus shifting the former field 0 to after them.)
8880         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8881
8882         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
8883         # there is no short name, use the full one in element 1
8884         if ($data[0] eq "n/a") {
8885             $data[0] = $data[1];
8886         }
8887         elsif ($data[0] ne $data[1]
8888                && standardize($data[0]) eq standardize($data[1])
8889                && $data[1] !~ /[[:upper:]]/)
8890         {
8891             # Also, there is a bug in the file in which "n/a" is omitted, and
8892             # the two fields are identical except for case, and the full name
8893             # is all lower case.  Copy the "short" name unto the full one to
8894             # give it some upper case.
8895
8896             $data[1] = $data[0];
8897         }
8898
8899         # Earlier releases had the pseudo property 'qc' that should expand to
8900         # the ones that replace it below.
8901         if ($property eq 'qc') {
8902             if (lc $data[0] eq 'y') {
8903                 $file->insert_lines('NFC_QC; Y      ; Yes',
8904                                     'NFD_QC; Y      ; Yes',
8905                                     'NFKC_QC; Y     ; Yes',
8906                                     'NFKD_QC; Y     ; Yes',
8907                                     );
8908             }
8909             elsif (lc $data[0] eq 'n') {
8910                 $file->insert_lines('NFC_QC; N      ; No',
8911                                     'NFD_QC; N      ; No',
8912                                     'NFKC_QC; N     ; No',
8913                                     'NFKD_QC; N     ; No',
8914                                     );
8915             }
8916             elsif (lc $data[0] eq 'm') {
8917                 $file->insert_lines('NFC_QC; M      ; Maybe',
8918                                     'NFKC_QC; M     ; Maybe',
8919                                     );
8920             }
8921             else {
8922                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8923             }
8924             next;
8925         }
8926
8927         # The first field is the short name, 2nd is the full one.
8928         my $property_object = property_ref($property);
8929         my $table = $property_object->add_match_table($data[0],
8930                                                 Full_Name => $data[1]);
8931
8932         # Start looking for more aliases after these two.
8933         for my $i (2 .. @data - 1) {
8934             $table->add_alias($data[$i]);
8935         }
8936     } # End of looping through the file
8937
8938     # As noted in the comments early in the program, it generates tables for
8939     # the default values for all releases, even those for which the concept
8940     # didn't exist at the time.  Here we add those if missing.
8941     my $age = property_ref('age');
8942     if (defined $age && ! defined $age->table('Unassigned')) {
8943         $age->add_match_table('Unassigned');
8944     }
8945     $block->add_match_table('No_Block') if -e 'Blocks.txt'
8946                                     && ! defined $block->table('No_Block');
8947
8948
8949     # Now set the default mappings of the properties from the file.  This is
8950     # done after the loop because a number of properties have only @missings
8951     # entries in the file, and may not show up until the end.
8952     my @defaults = $file->get_missings;
8953     foreach my $default_ref (@defaults) {
8954         my $default = $default_ref->[0];
8955         my $property = property_ref($default_ref->[1]);
8956         $property->set_default_map($default);
8957     }
8958     return;
8959 }
8960
8961 sub get_old_property_value_aliases () {
8962     # Returns what would be in PropValueAliases.txt if it existed in very old
8963     # versions of Unicode.  It was derived from the one in 3.2, and pared
8964     # down.  An attempt was made to use the existence of files to mean
8965     # inclusion or not of various aliases, but if this was not sufficient,
8966     # using version numbers was resorted to.
8967
8968     my @return = split /\n/, <<'END';
8969 bc ; AN        ; Arabic_Number
8970 bc ; B         ; Paragraph_Separator
8971 bc ; CS        ; Common_Separator
8972 bc ; EN        ; European_Number
8973 bc ; ES        ; European_Separator
8974 bc ; ET        ; European_Terminator
8975 bc ; L         ; Left_To_Right
8976 bc ; ON        ; Other_Neutral
8977 bc ; R         ; Right_To_Left
8978 bc ; WS        ; White_Space
8979
8980 # The standard combining classes are very much different in v1, so only use
8981 # ones that look right (not checked thoroughly)
8982 ccc;   0; NR   ; Not_Reordered
8983 ccc;   1; OV   ; Overlay
8984 ccc;   7; NK   ; Nukta
8985 ccc;   8; KV   ; Kana_Voicing
8986 ccc;   9; VR   ; Virama
8987 ccc; 202; ATBL ; Attached_Below_Left
8988 ccc; 216; ATAR ; Attached_Above_Right
8989 ccc; 218; BL   ; Below_Left
8990 ccc; 220; B    ; Below
8991 ccc; 222; BR   ; Below_Right
8992 ccc; 224; L    ; Left
8993 ccc; 228; AL   ; Above_Left
8994 ccc; 230; A    ; Above
8995 ccc; 232; AR   ; Above_Right
8996 ccc; 234; DA   ; Double_Above
8997
8998 dt ; can       ; canonical
8999 dt ; enc       ; circle
9000 dt ; fin       ; final
9001 dt ; font      ; font
9002 dt ; fra       ; fraction
9003 dt ; init      ; initial
9004 dt ; iso       ; isolated
9005 dt ; med       ; medial
9006 dt ; n/a       ; none
9007 dt ; nb        ; noBreak
9008 dt ; sqr       ; square
9009 dt ; sub       ; sub
9010 dt ; sup       ; super
9011
9012 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
9013 gc ; Cc        ; Control
9014 gc ; Cn        ; Unassigned
9015 gc ; Co        ; Private_Use
9016 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
9017 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
9018 gc ; Ll        ; Lowercase_Letter
9019 gc ; Lm        ; Modifier_Letter
9020 gc ; Lo        ; Other_Letter
9021 gc ; Lu        ; Uppercase_Letter
9022 gc ; M         ; Mark                             # Mc | Me | Mn
9023 gc ; Mc        ; Spacing_Mark
9024 gc ; Mn        ; Nonspacing_Mark
9025 gc ; N         ; Number                           # Nd | Nl | No
9026 gc ; Nd        ; Decimal_Number
9027 gc ; No        ; Other_Number
9028 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
9029 gc ; Pd        ; Dash_Punctuation
9030 gc ; Pe        ; Close_Punctuation
9031 gc ; Po        ; Other_Punctuation
9032 gc ; Ps        ; Open_Punctuation
9033 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
9034 gc ; Sc        ; Currency_Symbol
9035 gc ; Sm        ; Math_Symbol
9036 gc ; So        ; Other_Symbol
9037 gc ; Z         ; Separator                        # Zl | Zp | Zs
9038 gc ; Zl        ; Line_Separator
9039 gc ; Zp        ; Paragraph_Separator
9040 gc ; Zs        ; Space_Separator
9041
9042 nt ; de        ; Decimal
9043 nt ; di        ; Digit
9044 nt ; n/a       ; None
9045 nt ; nu        ; Numeric
9046 END
9047
9048     if (-e 'ArabicShaping.txt') {
9049         push @return, split /\n/, <<'END';
9050 jg ; n/a       ; AIN
9051 jg ; n/a       ; ALEF
9052 jg ; n/a       ; DAL
9053 jg ; n/a       ; GAF
9054 jg ; n/a       ; LAM
9055 jg ; n/a       ; MEEM
9056 jg ; n/a       ; NO_JOINING_GROUP
9057 jg ; n/a       ; NOON
9058 jg ; n/a       ; QAF
9059 jg ; n/a       ; SAD
9060 jg ; n/a       ; SEEN
9061 jg ; n/a       ; TAH
9062 jg ; n/a       ; WAW
9063
9064 jt ; C         ; Join_Causing
9065 jt ; D         ; Dual_Joining
9066 jt ; L         ; Left_Joining
9067 jt ; R         ; Right_Joining
9068 jt ; U         ; Non_Joining
9069 jt ; T         ; Transparent
9070 END
9071         if ($v_version ge v3.0.0) {
9072             push @return, split /\n/, <<'END';
9073 jg ; n/a       ; ALAPH
9074 jg ; n/a       ; BEH
9075 jg ; n/a       ; BETH
9076 jg ; n/a       ; DALATH_RISH
9077 jg ; n/a       ; E
9078 jg ; n/a       ; FEH
9079 jg ; n/a       ; FINAL_SEMKATH
9080 jg ; n/a       ; GAMAL
9081 jg ; n/a       ; HAH
9082 jg ; n/a       ; HAMZA_ON_HEH_GOAL
9083 jg ; n/a       ; HE
9084 jg ; n/a       ; HEH
9085 jg ; n/a       ; HEH_GOAL
9086 jg ; n/a       ; HETH
9087 jg ; n/a       ; KAF
9088 jg ; n/a       ; KAPH
9089 jg ; n/a       ; KNOTTED_HEH
9090 jg ; n/a       ; LAMADH
9091 jg ; n/a       ; MIM
9092 jg ; n/a       ; NUN
9093 jg ; n/a       ; PE
9094 jg ; n/a       ; QAPH
9095 jg ; n/a       ; REH
9096 jg ; n/a       ; REVERSED_PE
9097 jg ; n/a       ; SADHE
9098 jg ; n/a       ; SEMKATH
9099 jg ; n/a       ; SHIN
9100 jg ; n/a       ; SWASH_KAF
9101 jg ; n/a       ; TAW
9102 jg ; n/a       ; TEH_MARBUTA
9103 jg ; n/a       ; TETH
9104 jg ; n/a       ; YEH
9105 jg ; n/a       ; YEH_BARREE
9106 jg ; n/a       ; YEH_WITH_TAIL
9107 jg ; n/a       ; YUDH
9108 jg ; n/a       ; YUDH_HE
9109 jg ; n/a       ; ZAIN
9110 END
9111         }
9112     }
9113
9114
9115     if (-e 'EastAsianWidth.txt') {
9116         push @return, split /\n/, <<'END';
9117 ea ; A         ; Ambiguous
9118 ea ; F         ; Fullwidth
9119 ea ; H         ; Halfwidth
9120 ea ; N         ; Neutral
9121 ea ; Na        ; Narrow
9122 ea ; W         ; Wide
9123 END
9124     }
9125
9126     if (-e 'LineBreak.txt') {
9127         push @return, split /\n/, <<'END';
9128 lb ; AI        ; Ambiguous
9129 lb ; AL        ; Alphabetic
9130 lb ; B2        ; Break_Both
9131 lb ; BA        ; Break_After
9132 lb ; BB        ; Break_Before
9133 lb ; BK        ; Mandatory_Break
9134 lb ; CB        ; Contingent_Break
9135 lb ; CL        ; Close_Punctuation
9136 lb ; CM        ; Combining_Mark
9137 lb ; CR        ; Carriage_Return
9138 lb ; EX        ; Exclamation
9139 lb ; GL        ; Glue
9140 lb ; HY        ; Hyphen
9141 lb ; ID        ; Ideographic
9142 lb ; IN        ; Inseperable
9143 lb ; IS        ; Infix_Numeric
9144 lb ; LF        ; Line_Feed
9145 lb ; NS        ; Nonstarter
9146 lb ; NU        ; Numeric
9147 lb ; OP        ; Open_Punctuation
9148 lb ; PO        ; Postfix_Numeric
9149 lb ; PR        ; Prefix_Numeric
9150 lb ; QU        ; Quotation
9151 lb ; SA        ; Complex_Context
9152 lb ; SG        ; Surrogate
9153 lb ; SP        ; Space
9154 lb ; SY        ; Break_Symbols
9155 lb ; XX        ; Unknown
9156 lb ; ZW        ; ZWSpace
9157 END
9158     }
9159
9160     if (-e 'DNormalizationProps.txt') {
9161         push @return, split /\n/, <<'END';
9162 qc ; M         ; Maybe
9163 qc ; N         ; No
9164 qc ; Y         ; Yes
9165 END
9166     }
9167
9168     if (-e 'Scripts.txt') {
9169         push @return, split /\n/, <<'END';
9170 sc ; Arab      ; Arabic
9171 sc ; Armn      ; Armenian
9172 sc ; Beng      ; Bengali
9173 sc ; Bopo      ; Bopomofo
9174 sc ; Cans      ; Canadian_Aboriginal
9175 sc ; Cher      ; Cherokee
9176 sc ; Cyrl      ; Cyrillic
9177 sc ; Deva      ; Devanagari
9178 sc ; Dsrt      ; Deseret
9179 sc ; Ethi      ; Ethiopic
9180 sc ; Geor      ; Georgian
9181 sc ; Goth      ; Gothic
9182 sc ; Grek      ; Greek
9183 sc ; Gujr      ; Gujarati
9184 sc ; Guru      ; Gurmukhi
9185 sc ; Hang      ; Hangul
9186 sc ; Hani      ; Han
9187 sc ; Hebr      ; Hebrew
9188 sc ; Hira      ; Hiragana
9189 sc ; Ital      ; Old_Italic
9190 sc ; Kana      ; Katakana
9191 sc ; Khmr      ; Khmer
9192 sc ; Knda      ; Kannada
9193 sc ; Laoo      ; Lao
9194 sc ; Latn      ; Latin
9195 sc ; Mlym      ; Malayalam
9196 sc ; Mong      ; Mongolian
9197 sc ; Mymr      ; Myanmar
9198 sc ; Ogam      ; Ogham
9199 sc ; Orya      ; Oriya
9200 sc ; Qaai      ; Inherited
9201 sc ; Runr      ; Runic
9202 sc ; Sinh      ; Sinhala
9203 sc ; Syrc      ; Syriac
9204 sc ; Taml      ; Tamil
9205 sc ; Telu      ; Telugu
9206 sc ; Thaa      ; Thaana
9207 sc ; Thai      ; Thai
9208 sc ; Tibt      ; Tibetan
9209 sc ; Yiii      ; Yi
9210 sc ; Zyyy      ; Common
9211 END
9212     }
9213
9214     if ($v_version ge v2.0.0) {
9215         push @return, split /\n/, <<'END';
9216 dt ; com       ; compat
9217 dt ; nar       ; narrow
9218 dt ; sml       ; small
9219 dt ; vert      ; vertical
9220 dt ; wide      ; wide
9221
9222 gc ; Cf        ; Format
9223 gc ; Cs        ; Surrogate
9224 gc ; Lt        ; Titlecase_Letter
9225 gc ; Me        ; Enclosing_Mark
9226 gc ; Nl        ; Letter_Number
9227 gc ; Pc        ; Connector_Punctuation
9228 gc ; Sk        ; Modifier_Symbol
9229 END
9230     }
9231     if ($v_version ge v2.1.2) {
9232         push @return, "bc ; S         ; Segment_Separator\n";
9233     }
9234     if ($v_version ge v2.1.5) {
9235         push @return, split /\n/, <<'END';
9236 gc ; Pf        ; Final_Punctuation
9237 gc ; Pi        ; Initial_Punctuation
9238 END
9239     }
9240     if ($v_version ge v2.1.8) {
9241         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
9242     }
9243
9244     if ($v_version ge v3.0.0) {
9245         push @return, split /\n/, <<'END';
9246 bc ; AL        ; Arabic_Letter
9247 bc ; BN        ; Boundary_Neutral
9248 bc ; LRE       ; Left_To_Right_Embedding
9249 bc ; LRO       ; Left_To_Right_Override
9250 bc ; NSM       ; Nonspacing_Mark
9251 bc ; PDF       ; Pop_Directional_Format
9252 bc ; RLE       ; Right_To_Left_Embedding
9253 bc ; RLO       ; Right_To_Left_Override
9254
9255 ccc; 233; DB   ; Double_Below
9256 END
9257     }
9258
9259     if ($v_version ge v3.1.0) {
9260         push @return, "ccc; 226; R    ; Right\n";
9261     }
9262
9263     return @return;
9264 }
9265
9266 sub output_perl_charnames_line ($$) {
9267
9268     # Output the entries in Perl_charnames specially, using 5 digits instead
9269     # of four.  This makes the entries a constant length, and simplifies
9270     # charnames.pm which this table is for.  Unicode can have 6 digit
9271     # ordinals, but they are all private use or noncharacters which do not
9272     # have names, so won't be in this table.
9273
9274     return sprintf "%05X\t%s\n", $_[0], $_[1];
9275 }
9276
9277 { # Closure
9278     # This is used to store the range list of all the code points usable when
9279     # the little used $compare_versions feature is enabled.
9280     my $compare_versions_range_list;
9281
9282     sub process_generic_property_file {
9283         # This processes a file containing property mappings and puts them
9284         # into internal map tables.  It should be used to handle any property
9285         # files that have mappings from a code point or range thereof to
9286         # something else.  This means almost all the UCD .txt files.
9287         # each_line_handlers() should be set to adjust the lines of these
9288         # files, if necessary, to what this routine understands:
9289         #
9290         # 0374          ; NFD_QC; N
9291         # 003C..003E    ; Math
9292         #
9293         # the fields are: "codepoint-range ; property; map"
9294         #
9295         # meaning the codepoints in the range all have the value 'map' under
9296         # 'property'.
9297         # Beginning and trailing white space in each field are not significant.
9298         # Note there is not a trailing semi-colon in the above.  A trailing
9299         # semi-colon means the map is a null-string.  An omitted map, as
9300         # opposed to a null-string, is assumed to be 'Y', based on Unicode
9301         # table syntax.  (This could have been hidden from this routine by
9302         # doing it in the $file object, but that would require parsing of the
9303         # line there, so would have to parse it twice, or change the interface
9304         # to pass this an array.  So not done.)
9305         #
9306         # The map field may begin with a sequence of commands that apply to
9307         # this range.  Each such command begins and ends with $CMD_DELIM.
9308         # These are used to indicate, for example, that the mapping for a
9309         # range has a non-default type.
9310         #
9311         # This loops through the file, calling it's next_line() method, and
9312         # then taking the map and adding it to the property's table.
9313         # Complications arise because any number of properties can be in the
9314         # file, in any order, interspersed in any way.  The first time a
9315         # property is seen, it gets information about that property and
9316         # caches it for quick retrieval later.  It also normalizes the maps
9317         # so that only one of many synonyms is stored.  The Unicode input
9318         # files do use some multiple synonyms.
9319
9320         my $file = shift;
9321         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9322
9323         my %property_info;               # To keep track of what properties
9324                                          # have already had entries in the
9325                                          # current file, and info about each,
9326                                          # so don't have to recompute.
9327         my $property_name;               # property currently being worked on
9328         my $property_type;               # and its type
9329         my $previous_property_name = ""; # name from last time through loop
9330         my $property_object;             # pointer to the current property's
9331                                          # object
9332         my $property_addr;               # the address of that object
9333         my $default_map;                 # the string that code points missing
9334                                          # from the file map to
9335         my $default_table;               # For non-string properties, a
9336                                          # reference to the match table that
9337                                          # will contain the list of code
9338                                          # points that map to $default_map.
9339
9340         # Get the next real non-comment line
9341         LINE:
9342         while ($file->next_line) {
9343
9344             # Default replacement type; means that if parts of the range have
9345             # already been stored in our tables, the new map overrides them if
9346             # they differ more than cosmetically
9347             my $replace = $IF_NOT_EQUIVALENT;
9348             my $map_type;            # Default type for the map of this range
9349
9350             #local $to_trace = 1 if main::DEBUG;
9351             trace $_ if main::DEBUG && $to_trace;
9352
9353             # Split the line into components
9354             my ($range, $property_name, $map, @remainder)
9355                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9356
9357             # If more or less on the line than we are expecting, warn and skip
9358             # the line
9359             if (@remainder) {
9360                 $file->carp_bad_line('Extra fields');
9361                 next LINE;
9362             }
9363             elsif ( ! defined $property_name) {
9364                 $file->carp_bad_line('Missing property');
9365                 next LINE;
9366             }
9367
9368             # Examine the range.
9369             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9370             {
9371                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9372                 next LINE;
9373             }
9374             my $low = hex $1;
9375             my $high = (defined $2) ? hex $2 : $low;
9376
9377             # For the very specialized case of comparing two Unicode
9378             # versions...
9379             if (DEBUG && $compare_versions) {
9380                 if ($property_name eq 'Age') {
9381
9382                     # Only allow code points at least as old as the version
9383                     # specified.
9384                     my $age = pack "C*", split(/\./, $map);        # v string
9385                     next LINE if $age gt $compare_versions;
9386                 }
9387                 else {
9388
9389                     # Again, we throw out code points younger than those of
9390                     # the specified version.  By now, the Age property is
9391                     # populated.  We use the intersection of each input range
9392                     # with this property to find what code points in it are
9393                     # valid.   To do the intersection, we have to convert the
9394                     # Age property map to a Range_list.  We only have to do
9395                     # this once.
9396                     if (! defined $compare_versions_range_list) {
9397                         my $age = property_ref('Age');
9398                         if (! -e 'DAge.txt') {
9399                             croak "Need to have 'DAge.txt' file to do version comparison";
9400                         }
9401                         elsif ($age->count == 0) {
9402                             croak "The 'Age' table is empty, but its file exists";
9403                         }
9404                         $compare_versions_range_list
9405                                         = Range_List->new(Initialize => $age);
9406                     }
9407
9408                     # An undefined map is always 'Y'
9409                     $map = 'Y' if ! defined $map;
9410
9411                     # Calculate the intersection of the input range with the
9412                     # code points that are known in the specified version
9413                     my @ranges = ($compare_versions_range_list
9414                                   & Range->new($low, $high))->ranges;
9415
9416                     # If the intersection is empty, throw away this range
9417                     next LINE unless @ranges;
9418
9419                     # Only examine the first range this time through the loop.
9420                     my $this_range = shift @ranges;
9421
9422                     # Put any remaining ranges in the queue to be processed
9423                     # later.  Note that there is unnecessary work here, as we
9424                     # will do the intersection again for each of these ranges
9425                     # during some future iteration of the LINE loop, but this
9426                     # code is not used in production.  The later intersections
9427                     # are guaranteed to not splinter, so this will not become
9428                     # an infinite loop.
9429                     my $line = join ';', $property_name, $map;
9430                     foreach my $range (@ranges) {
9431                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9432                                                             $range->start,
9433                                                             $range->end,
9434                                                             $line));
9435                     }
9436
9437                     # And process the first range, like any other.
9438                     $low = $this_range->start;
9439                     $high = $this_range->end;
9440                 }
9441             } # End of $compare_versions
9442
9443             # If changing to a new property, get the things constant per
9444             # property
9445             if ($previous_property_name ne $property_name) {
9446
9447                 $property_object = property_ref($property_name);
9448                 if (! defined $property_object) {
9449                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
9450                     next LINE;
9451                 }
9452                 { no overloading; $property_addr = pack 'J', $property_object; }
9453
9454                 # Defer changing names until have a line that is acceptable
9455                 # (the 'next' statement above means is unacceptable)
9456                 $previous_property_name = $property_name;
9457
9458                 # If not the first time for this property, retrieve info about
9459                 # it from the cache
9460                 if (defined ($property_info{$property_addr}{'type'})) {
9461                     $property_type = $property_info{$property_addr}{'type'};
9462                     $default_map = $property_info{$property_addr}{'default'};
9463                     $map_type
9464                         = $property_info{$property_addr}{'pseudo_map_type'};
9465                     $default_table
9466                             = $property_info{$property_addr}{'default_table'};
9467                 }
9468                 else {
9469
9470                     # Here, is the first time for this property.  Set up the
9471                     # cache.
9472                     $property_type = $property_info{$property_addr}{'type'}
9473                                    = $property_object->type;
9474                     $map_type
9475                         = $property_info{$property_addr}{'pseudo_map_type'}
9476                         = $property_object->pseudo_map_type;
9477
9478                     # The Unicode files are set up so that if the map is not
9479                     # defined, it is a binary property
9480                     if (! defined $map && $property_type != $BINARY) {
9481                         if ($property_type != $UNKNOWN
9482                             && $property_type != $NON_STRING)
9483                         {
9484                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
9485                         }
9486                         else {
9487                             $property_object->set_type($BINARY);
9488                             $property_type
9489                                 = $property_info{$property_addr}{'type'}
9490                                 = $BINARY;
9491                         }
9492                     }
9493
9494                     # Get any @missings default for this property.  This
9495                     # should precede the first entry for the property in the
9496                     # input file, and is located in a comment that has been
9497                     # stored by the Input_file class until we access it here.
9498                     # It's possible that there is more than one such line
9499                     # waiting for us; collect them all, and parse
9500                     my @missings_list = $file->get_missings
9501                                             if $file->has_missings_defaults;
9502                     foreach my $default_ref (@missings_list) {
9503                         my $default = $default_ref->[0];
9504                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9505
9506                         # For string properties, the default is just what the
9507                         # file says, but non-string properties should already
9508                         # have set up a table for the default property value;
9509                         # use the table for these, so can resolve synonyms
9510                         # later to a single standard one.
9511                         if ($property_type == $STRING
9512                             || $property_type == $UNKNOWN)
9513                         {
9514                             $property_info{$addr}{'missings'} = $default;
9515                         }
9516                         else {
9517                             $property_info{$addr}{'missings'}
9518                                         = $property_object->table($default);
9519                         }
9520                     }
9521
9522                     # Finished storing all the @missings defaults in the input
9523                     # file so far.  Get the one for the current property.
9524                     my $missings = $property_info{$property_addr}{'missings'};
9525
9526                     # But we likely have separately stored what the default
9527                     # should be.  (This is to accommodate versions of the
9528                     # standard where the @missings lines are absent or
9529                     # incomplete.)  Hopefully the two will match.  But check
9530                     # it out.
9531                     $default_map = $property_object->default_map;
9532
9533                     # If the map is a ref, it means that the default won't be
9534                     # processed until later, so undef it, so next few lines
9535                     # will redefine it to something that nothing will match
9536                     undef $default_map if ref $default_map;
9537
9538                     # Create a $default_map if don't have one; maybe a dummy
9539                     # that won't match anything.
9540                     if (! defined $default_map) {
9541
9542                         # Use any @missings line in the file.
9543                         if (defined $missings) {
9544                             if (ref $missings) {
9545                                 $default_map = $missings->full_name;
9546                                 $default_table = $missings;
9547                             }
9548                             else {
9549                                 $default_map = $missings;
9550                             }
9551
9552                             # And store it with the property for outside use.
9553                             $property_object->set_default_map($default_map);
9554                         }
9555                         else {
9556
9557                             # Neither an @missings nor a default map.  Create
9558                             # a dummy one, so won't have to test definedness
9559                             # in the main loop.
9560                             $default_map = '_Perl This will never be in a file
9561                                             from Unicode';
9562                         }
9563                     }
9564
9565                     # Here, we have $default_map defined, possibly in terms of
9566                     # $missings, but maybe not, and possibly is a dummy one.
9567                     if (defined $missings) {
9568
9569                         # Make sure there is no conflict between the two.
9570                         # $missings has priority.
9571                         if (ref $missings) {
9572                             $default_table
9573                                         = $property_object->table($default_map);
9574                             if (! defined $default_table
9575                                 || $default_table != $missings)
9576                             {
9577                                 if (! defined $default_table) {
9578                                     $default_table = $UNDEF;
9579                                 }
9580                                 $file->carp_bad_line(<<END
9581 The \@missings line for $property_name in $file says that missings default to
9582 $missings, but we expect it to be $default_table.  $missings used.
9583 END
9584                                 );
9585                                 $default_table = $missings;
9586                                 $default_map = $missings->full_name;
9587                             }
9588                             $property_info{$property_addr}{'default_table'}
9589                                                         = $default_table;
9590                         }
9591                         elsif ($default_map ne $missings) {
9592                             $file->carp_bad_line(<<END
9593 The \@missings line for $property_name in $file says that missings default to
9594 $missings, but we expect it to be $default_map.  $missings used.
9595 END
9596                             );
9597                             $default_map = $missings;
9598                         }
9599                     }
9600
9601                     $property_info{$property_addr}{'default'}
9602                                                     = $default_map;
9603
9604                     # If haven't done so already, find the table corresponding
9605                     # to this map for non-string properties.
9606                     if (! defined $default_table
9607                         && $property_type != $STRING
9608                         && $property_type != $UNKNOWN)
9609                     {
9610                         $default_table = $property_info{$property_addr}
9611                                                         {'default_table'}
9612                                     = $property_object->table($default_map);
9613                     }
9614                 } # End of is first time for this property
9615             } # End of switching properties.
9616
9617             # Ready to process the line.
9618             # The Unicode files are set up so that if the map is not defined,
9619             # it is a binary property with value 'Y'
9620             if (! defined $map) {
9621                 $map = 'Y';
9622             }
9623             else {
9624
9625                 # If the map begins with a special command to us (enclosed in
9626                 # delimiters), extract the command(s).
9627                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9628                     my $command = $1;
9629                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
9630                         $replace = $1;
9631                     }
9632                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
9633                         $map_type = $1;
9634                     }
9635                     else {
9636                         $file->carp_bad_line("Unknown command line: '$1'");
9637                         next LINE;
9638                     }
9639                 }
9640             }
9641
9642             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9643             {
9644
9645                 # Here, we have a map to a particular code point, and the
9646                 # default map is to a code point itself.  If the range
9647                 # includes the particular code point, change that portion of
9648                 # the range to the default.  This makes sure that in the final
9649                 # table only the non-defaults are listed.
9650                 my $decimal_map = hex $map;
9651                 if ($low <= $decimal_map && $decimal_map <= $high) {
9652
9653                     # If the range includes stuff before or after the map
9654                     # we're changing, split it and process the split-off parts
9655                     # later.
9656                     if ($low < $decimal_map) {
9657                         $file->insert_adjusted_lines(
9658                                             sprintf("%04X..%04X; %s; %s",
9659                                                     $low,
9660                                                     $decimal_map - 1,
9661                                                     $property_name,
9662                                                     $map));
9663                     }
9664                     if ($high > $decimal_map) {
9665                         $file->insert_adjusted_lines(
9666                                             sprintf("%04X..%04X; %s; %s",
9667                                                     $decimal_map + 1,
9668                                                     $high,
9669                                                     $property_name,
9670                                                     $map));
9671                     }
9672                     $low = $high = $decimal_map;
9673                     $map = $CODE_POINT;
9674                 }
9675             }
9676
9677             # If we can tell that this is a synonym for the default map, use
9678             # the default one instead.
9679             if ($property_type != $STRING
9680                 && $property_type != $UNKNOWN)
9681             {
9682                 my $table = $property_object->table($map);
9683                 if (defined $table && $table == $default_table) {
9684                     $map = $default_map;
9685                 }
9686             }
9687
9688             # And figure out the map type if not known.
9689             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9690                 if ($map eq "") {   # Nulls are always $NULL map type
9691                     $map_type = $NULL;
9692                 } # Otherwise, non-strings, and those that don't allow
9693                   # $MULTI_CP, and those that aren't multiple code points are
9694                   # 0
9695                 elsif
9696                    (($property_type != $STRING && $property_type != $UNKNOWN)
9697                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9698                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
9699                 {
9700                     $map_type = 0;
9701                 }
9702                 else {
9703                     $map_type = $MULTI_CP;
9704                 }
9705             }
9706
9707             $property_object->add_map($low, $high,
9708                                         $map,
9709                                         Type => $map_type,
9710                                         Replace => $replace);
9711         } # End of loop through file's lines
9712
9713         return;
9714     }
9715 }
9716
9717 { # Closure for UnicodeData.txt handling
9718
9719     # This file was the first one in the UCD; its design leads to some
9720     # awkwardness in processing.  Here is a sample line:
9721     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9722     # The fields in order are:
9723     my $i = 0;            # The code point is in field 0, and is shifted off.
9724     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
9725     my $CATEGORY = $i++;  # category (e.g. "Lu")
9726     my $CCC = $i++;       # Canonical combining class (e.g. "230")
9727     my $BIDI = $i++;      # directional class (e.g. "L")
9728     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
9729     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
9730     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9731                                          # Dual-use in this program; see below
9732     my $NUMERIC = $i++;   # numeric value
9733     my $MIRRORED = $i++;  # ? mirrored
9734     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9735     my $COMMENT = $i++;   # iso comment
9736     my $UPPER = $i++;     # simple uppercase mapping
9737     my $LOWER = $i++;     # simple lowercase mapping
9738     my $TITLE = $i++;     # simple titlecase mapping
9739     my $input_field_count = $i;
9740
9741     # This routine in addition outputs these extra fields:
9742     my $DECOMP_TYPE = $i++; # Decomposition type
9743
9744     # These fields are modifications of ones above, and are usually
9745     # suppressed; they must come last, as for speed, the loop upper bound is
9746     # normally set to ignore them
9747     my $NAME = $i++;        # This is the strict name field, not the one that
9748                             # charnames uses.
9749     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
9750                             # by Unicode::Normalize
9751     my $last_field = $i - 1;
9752
9753     # All these are read into an array for each line, with the indices defined
9754     # above.  The empty fields in the example line above indicate that the
9755     # value is defaulted.  The handler called for each line of the input
9756     # changes these to their defaults.
9757
9758     # Here are the official names of the properties, in a parallel array:
9759     my @field_names;
9760     $field_names[$BIDI] = 'Bidi_Class';
9761     $field_names[$CATEGORY] = 'General_Category';
9762     $field_names[$CCC] = 'Canonical_Combining_Class';
9763     $field_names[$CHARNAME] = 'Perl_Charnames';
9764     $field_names[$COMMENT] = 'ISO_Comment';
9765     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9766     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9767     $field_names[$LOWER] = 'Lowercase_Mapping';
9768     $field_names[$MIRRORED] = 'Bidi_Mirrored';
9769     $field_names[$NAME] = 'Name';
9770     $field_names[$NUMERIC] = 'Numeric_Value';
9771     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9772     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9773     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9774     $field_names[$TITLE] = 'Titlecase_Mapping';
9775     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9776     $field_names[$UPPER] = 'Uppercase_Mapping';
9777
9778     # Some of these need a little more explanation:
9779     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9780     #   property, but is used in calculating the Numeric_Type.  Perl however,
9781     #   creates a file from this field, so a Perl property is created from it.
9782     # Similarly, the Other_Digit field is used only for calculating the
9783     #   Numeric_Type, and so it can be safely re-used as the place to store
9784     #   the value for Numeric_Type; hence it is referred to as
9785     #   $NUMERIC_TYPE_OTHER_DIGIT.
9786     # The input field named $PERL_DECOMPOSITION is a combination of both the
9787     #   decomposition mapping and its type.  Perl creates a file containing
9788     #   exactly this field, so it is used for that.  The two properties are
9789     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9790     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
9791     #   output it), as Perl doesn't use it directly.
9792     # The input field named here $CHARNAME is used to construct the
9793     #   Perl_Charnames property, which is a combination of the Name property
9794     #   (which the input field contains), and the Unicode_1_Name property, and
9795     #   others from other files.  Since, the strict Name property is not used
9796     #   by Perl, this field is used for the table that Perl does use.  The
9797     #   strict Name property table is usually suppressed (unless the lists are
9798     #   changed to output it), so it is accumulated in a separate field,
9799     #   $NAME, which to save time is discarded unless the table is actually to
9800     #   be output
9801
9802     # This file is processed like most in this program.  Control is passed to
9803     # process_generic_property_file() which calls filter_UnicodeData_line()
9804     # for each input line.  This filter converts the input into line(s) that
9805     # process_generic_property_file() understands.  There is also a setup
9806     # routine called before any of the file is processed, and a handler for
9807     # EOF processing, all in this closure.
9808
9809     # A huge speed-up occurred at the cost of some added complexity when these
9810     # routines were altered to buffer the outputs into ranges.  Almost all the
9811     # lines of the input file apply to just one code point, and for most
9812     # properties, the map for the next code point up is the same as the
9813     # current one.  So instead of creating a line for each property for each
9814     # input line, filter_UnicodeData_line() remembers what the previous map
9815     # of a property was, and doesn't generate a line to pass on until it has
9816     # to, as when the map changes; and that passed-on line encompasses the
9817     # whole contiguous range of code points that have the same map for that
9818     # property.  This means a slight amount of extra setup, and having to
9819     # flush these buffers on EOF, testing if the maps have changed, plus
9820     # remembering state information in the closure.  But it means a lot less
9821     # real time in not having to change the data base for each property on
9822     # each line.
9823
9824     # Another complication is that there are already a few ranges designated
9825     # in the input.  There are two lines for each, with the same maps except
9826     # the code point and name on each line.  This was actually the hardest
9827     # thing to design around.  The code points in those ranges may actually
9828     # have real maps not given by these two lines.  These maps will either
9829     # be algorithmically determinable, or in the extracted files furnished
9830     # with the UCD.  In the event of conflicts between these extracted files,
9831     # and this one, Unicode says that this one prevails.  But it shouldn't
9832     # prevail for conflicts that occur in these ranges.  The data from the
9833     # extracted files prevails in those cases.  So, this program is structured
9834     # so that those files are processed first, storing maps.  Then the other
9835     # files are processed, generally overwriting what the extracted files
9836     # stored.  But just the range lines in this input file are processed
9837     # without overwriting.  This is accomplished by adding a special string to
9838     # the lines output to tell process_generic_property_file() to turn off the
9839     # overwriting for just this one line.
9840     # A similar mechanism is used to tell it that the map is of a non-default
9841     # type.
9842
9843     sub setup_UnicodeData { # Called before any lines of the input are read
9844         my $file = shift;
9845         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9846
9847         # Create a new property specially located that is a combination of the
9848         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9849         # Name_Alias properties.  (The final duplicates elements of the
9850         # first.)  A comment for it will later be constructed based on the
9851         # actual properties present and used
9852         $perl_charname = Property->new('Perl_Charnames',
9853                        Core_Access => '\N{...} and "use charnames"',
9854                        Default_Map => "",
9855                        Directory => File::Spec->curdir(),
9856                        File => 'Name',
9857                        Internal_Only_Warning => 1,
9858                        Perl_Extension => 1,
9859                        Range_Size_1 => \&output_perl_charnames_line,
9860                        Type => $STRING,
9861                        );
9862
9863         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9864                                         Directory => File::Spec->curdir(),
9865                                         File => 'Decomposition',
9866                                         Format => $DECOMP_STRING_FORMAT,
9867                                         Internal_Only_Warning => 1,
9868                                         Perl_Extension => 1,
9869                                         Default_Map => $CODE_POINT,
9870
9871                                         # normalize.pm can't cope with these
9872                                         Output_Range_Counts => 0,
9873
9874                                         # This is a specially formatted table
9875                                         # explicitly for normalize.pm, which
9876                                         # is expecting a particular format,
9877                                         # which means that mappings containing
9878                                         # multiple code points are in the main
9879                                         # body of the table
9880                                         Map_Type => $COMPUTE_NO_MULTI_CP,
9881                                         Type => $STRING,
9882                                         );
9883         $Perl_decomp->add_comment(join_lines(<<END
9884 This mapping is a combination of the Unicode 'Decomposition_Type' and
9885 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
9886 identical to the official Unicode 'Decomposition_Mapping'  property except for
9887 two things:
9888  1) It omits the algorithmically determinable Hangul syllable decompositions,
9889 which normalize.pm handles algorithmically.
9890  2) It contains the decomposition type as well.  Non-canonical decompositions
9891 begin with a word in angle brackets, like <super>, which denotes the
9892 compatible decomposition type.  If the map does not begin with the <angle
9893 brackets>, the decomposition is canonical.
9894 END
9895         ));
9896
9897         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9898                                         Default_Map => "",
9899                                         Perl_Extension => 1,
9900                                         File => 'Digit',    # Trad. location
9901                                         Directory => $map_directory,
9902                                         Type => $STRING,
9903                                         Range_Size_1 => 1,
9904                                         );
9905         $Decimal_Digit->add_comment(join_lines(<<END
9906 This file gives the mapping of all code points which represent a single
9907 decimal digit [0-9] to their respective digits.  For example, the code point
9908 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
9909 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9910 numerals.
9911 END
9912         ));
9913
9914         # These properties are not used for generating anything else, and are
9915         # usually not output.  By making them last in the list, we can just
9916         # change the high end of the loop downwards to avoid the work of
9917         # generating a table(s) that is/are just going to get thrown away.
9918         if (! property_ref('Decomposition_Mapping')->to_output_map
9919             && ! property_ref('Name')->to_output_map)
9920         {
9921             $last_field = min($NAME, $DECOMP_MAP) - 1;
9922         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9923             $last_field = $DECOMP_MAP;
9924         } elsif (property_ref('Name')->to_output_map) {
9925             $last_field = $NAME;
9926         }
9927         return;
9928     }
9929
9930     my $first_time = 1;                 # ? Is this the first line of the file
9931     my $in_range = 0;                   # ? Are we in one of the file's ranges
9932     my $previous_cp;                    # hex code point of previous line
9933     my $decimal_previous_cp = -1;       # And its decimal equivalent
9934     my @start;                          # For each field, the current starting
9935                                         # code point in hex for the range
9936                                         # being accumulated.
9937     my @fields;                         # The input fields;
9938     my @previous_fields;                # And those from the previous call
9939
9940     sub filter_UnicodeData_line {
9941         # Handle a single input line from UnicodeData.txt; see comments above
9942         # Conceptually this takes a single line from the file containing N
9943         # properties, and converts it into N lines with one property per line,
9944         # which is what the final handler expects.  But there are
9945         # complications due to the quirkiness of the input file, and to save
9946         # time, it accumulates ranges where the property values don't change
9947         # and only emits lines when necessary.  This is about an order of
9948         # magnitude fewer lines emitted.
9949
9950         my $file = shift;
9951         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9952
9953         # $_ contains the input line.
9954         # -1 in split means retain trailing null fields
9955         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9956
9957         #local $to_trace = 1 if main::DEBUG;
9958         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9959         if (@fields > $input_field_count) {
9960             $file->carp_bad_line('Extra fields');
9961             $_ = "";
9962             return;
9963         }
9964
9965         my $decimal_cp = hex $cp;
9966
9967         # We have to output all the buffered ranges when the next code point
9968         # is not exactly one after the previous one, which means there is a
9969         # gap in the ranges.
9970         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9971
9972         # The decomposition mapping field requires special handling.  It looks
9973         # like either:
9974         #
9975         # <compat> 0032 0020
9976         # 0041 0300
9977         #
9978         # The decomposition type is enclosed in <brackets>; if missing, it
9979         # means the type is canonical.  There are two decomposition mapping
9980         # tables: the one for use by Perl's normalize.pm has a special format
9981         # which is this field intact; the other, for general use is of
9982         # standard format.  In either case we have to find the decomposition
9983         # type.  Empty fields have None as their type, and map to the code
9984         # point itself
9985         if ($fields[$PERL_DECOMPOSITION] eq "") {
9986             $fields[$DECOMP_TYPE] = 'None';
9987             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9988         }
9989         else {
9990             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9991                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
9992             if (! defined $fields[$DECOMP_TYPE]) {
9993                 $fields[$DECOMP_TYPE] = 'Canonical';
9994                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9995             }
9996             else {
9997                 $fields[$DECOMP_MAP] = $map;
9998             }
9999         }
10000
10001         # The 3 numeric fields also require special handling.  The 2 digit
10002         # fields must be either empty or match the number field.  This means
10003         # that if it is empty, they must be as well, and the numeric type is
10004         # None, and the numeric value is 'Nan'.
10005         # The decimal digit field must be empty or match the other digit
10006         # field.  If the decimal digit field is non-empty, the code point is
10007         # a decimal digit, and the other two fields will have the same value.
10008         # If it is empty, but the other digit field is non-empty, the code
10009         # point is an 'other digit', and the number field will have the same
10010         # value as the other digit field.  If the other digit field is empty,
10011         # but the number field is non-empty, the code point is a generic
10012         # numeric type.
10013         if ($fields[$NUMERIC] eq "") {
10014             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10015                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10016             ) {
10017                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
10018             }
10019             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10020             $fields[$NUMERIC] = 'NaN';
10021         }
10022         else {
10023             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10024             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10025                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10026                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10027             }
10028             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10029                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10030                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10031             }
10032             else {
10033                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10034
10035                 # Rationals require extra effort.
10036                 register_fraction($fields[$NUMERIC])
10037                                                 if $fields[$NUMERIC] =~ qr{/};
10038             }
10039         }
10040
10041         # For the properties that have empty fields in the file, and which
10042         # mean something different from empty, change them to that default.
10043         # Certain fields just haven't been empty so far in any Unicode
10044         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10045         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
10046         # the defaults; which are very unlikely to ever change.
10047         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10048         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10049
10050         # UAX44 says that if title is empty, it is the same as whatever upper
10051         # is,
10052         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10053
10054         # There are a few pairs of lines like:
10055         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10056         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10057         # that define ranges.  These should be processed after the fields are
10058         # adjusted above, as they may override some of them; but mostly what
10059         # is left is to possibly adjust the $CHARNAME field.  The names of all the
10060         # paired lines start with a '<', but this is also true of '<control>,
10061         # which isn't one of these special ones.
10062         if ($fields[$CHARNAME] eq '<control>') {
10063
10064             # Some code points in this file have the pseudo-name
10065             # '<control>', but the official name for such ones is the null
10066             # string.  For charnames.pm, we use the Unicode version 1 name
10067             $fields[$NAME] = "";
10068             $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
10069
10070             # We had better not be in between range lines.
10071             if ($in_range) {
10072                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10073                 $in_range = 0;
10074             }
10075         }
10076         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10077
10078             # Here is a non-range line.  We had better not be in between range
10079             # lines.
10080             if ($in_range) {
10081                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10082                 $in_range = 0;
10083             }
10084             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10085
10086                 # These are code points whose names end in their code points,
10087                 # which means the names are algorithmically derivable from the
10088                 # code points.  To shorten the output Name file, the algorithm
10089                 # for deriving these is placed in the file instead of each
10090                 # code point, so they have map type $CP_IN_NAME
10091                 $fields[$CHARNAME] = $CMD_DELIM
10092                                  . $MAP_TYPE_CMD
10093                                  . '='
10094                                  . $CP_IN_NAME
10095                                  . $CMD_DELIM
10096                                  . $fields[$CHARNAME];
10097             }
10098             $fields[$NAME] = $fields[$CHARNAME];
10099         }
10100         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10101             $fields[$CHARNAME] = $fields[$NAME] = $1;
10102
10103             # Here we are at the beginning of a range pair.
10104             if ($in_range) {
10105                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
10106             }
10107             $in_range = 1;
10108
10109             # Because the properties in the range do not overwrite any already
10110             # in the db, we must flush the buffers of what's already there, so
10111             # they get handled in the normal scheme.
10112             $force_output = 1;
10113
10114         }
10115         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10116             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
10117             $_ = "";
10118             return;
10119         }
10120         else { # Here, we are at the last line of a range pair.
10121
10122             if (! $in_range) {
10123                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
10124                 $_ = "";
10125                 return;
10126             }
10127             $in_range = 0;
10128
10129             $fields[$NAME] = $fields[$CHARNAME];
10130
10131             # Check that the input is valid: that the closing of the range is
10132             # the same as the beginning.
10133             foreach my $i (0 .. $last_field) {
10134                 next if $fields[$i] eq $previous_fields[$i];
10135                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
10136             }
10137
10138             # The processing differs depending on the type of range,
10139             # determined by its $CHARNAME
10140             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10141
10142                 # Check that the data looks right.
10143                 if ($decimal_previous_cp != $SBase) {
10144                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
10145                 }
10146                 if ($decimal_cp != $SBase + $SCount - 1) {
10147                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
10148                 }
10149
10150                 # The Hangul syllable range has a somewhat complicated name
10151                 # generation algorithm.  Each code point in it has a canonical
10152                 # decomposition also computable by an algorithm.  The
10153                 # perl decomposition map table built from these is used only
10154                 # by normalize.pm, which has the algorithm built in it, so the
10155                 # decomposition maps are not needed, and are large, so are
10156                 # omitted from it.  If the full decomposition map table is to
10157                 # be output, the decompositions are generated for it, in the
10158                 # EOF handling code for this input file.
10159
10160                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10161
10162                 # This range is stored in our internal structure with its
10163                 # own map type, different from all others.
10164                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10165                                         = $CMD_DELIM
10166                                           . $MAP_TYPE_CMD
10167                                           . '='
10168                                           . $HANGUL_SYLLABLE
10169                                           . $CMD_DELIM
10170                                           . $fields[$CHARNAME];
10171             }
10172             elsif ($fields[$CHARNAME] =~ /^CJK/) {
10173
10174                 # The name for these contains the code point itself, and all
10175                 # are defined to have the same base name, regardless of what
10176                 # is in the file.  They are stored in our internal structure
10177                 # with a map type of $CP_IN_NAME
10178                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10179                                         = $CMD_DELIM
10180                                            . $MAP_TYPE_CMD
10181                                            . '='
10182                                            . $CP_IN_NAME
10183                                            . $CMD_DELIM
10184                                            . 'CJK UNIFIED IDEOGRAPH';
10185
10186             }
10187             elsif ($fields[$CATEGORY] eq 'Co'
10188                      || $fields[$CATEGORY] eq 'Cs')
10189             {
10190                 # The names of all the code points in these ranges are set to
10191                 # null, as there are no names for the private use and
10192                 # surrogate code points.
10193
10194                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10195             }
10196             else {
10197                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
10198             }
10199
10200             # The first line of the range caused everything else to be output,
10201             # and then its values were stored as the beginning values for the
10202             # next set of ranges, which this one ends.  Now, for each value,
10203             # add a command to tell the handler that these values should not
10204             # replace any existing ones in our database.
10205             foreach my $i (0 .. $last_field) {
10206                 $previous_fields[$i] = $CMD_DELIM
10207                                         . $REPLACE_CMD
10208                                         . '='
10209                                         . $NO
10210                                         . $CMD_DELIM
10211                                         . $previous_fields[$i];
10212             }
10213
10214             # And change things so it looks like the entire range has been
10215             # gone through with this being the final part of it.  Adding the
10216             # command above to each field will cause this range to be flushed
10217             # during the next iteration, as it guaranteed that the stored
10218             # field won't match whatever value the next one has.
10219             $previous_cp = $cp;
10220             $decimal_previous_cp = $decimal_cp;
10221
10222             # We are now set up for the next iteration; so skip the remaining
10223             # code in this subroutine that does the same thing, but doesn't
10224             # know about these ranges.
10225             $_ = "";
10226
10227             return;
10228         }
10229
10230         # On the very first line, we fake it so the code below thinks there is
10231         # nothing to output, and initialize so that when it does get output it
10232         # uses the first line's values for the lowest part of the range.
10233         # (One could avoid this by using peek(), but then one would need to
10234         # know the adjustments done above and do the same ones in the setup
10235         # routine; not worth it)
10236         if ($first_time) {
10237             $first_time = 0;
10238             @previous_fields = @fields;
10239             @start = ($cp) x scalar @fields;
10240             $decimal_previous_cp = $decimal_cp - 1;
10241         }
10242
10243         # For each field, output the stored up ranges that this code point
10244         # doesn't fit in.  Earlier we figured out if all ranges should be
10245         # terminated because of changing the replace or map type styles, or if
10246         # there is a gap between this new code point and the previous one, and
10247         # that is stored in $force_output.  But even if those aren't true, we
10248         # need to output the range if this new code point's value for the
10249         # given property doesn't match the stored range's.
10250         #local $to_trace = 1 if main::DEBUG;
10251         foreach my $i (0 .. $last_field) {
10252             my $field = $fields[$i];
10253             if ($force_output || $field ne $previous_fields[$i]) {
10254
10255                 # Flush the buffer of stored values.
10256                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10257
10258                 # Start a new range with this code point and its value
10259                 $start[$i] = $cp;
10260                 $previous_fields[$i] = $field;
10261             }
10262         }
10263
10264         # Set the values for the next time.
10265         $previous_cp = $cp;
10266         $decimal_previous_cp = $decimal_cp;
10267
10268         # The input line has generated whatever adjusted lines are needed, and
10269         # should not be looked at further.
10270         $_ = "";
10271         return;
10272     }
10273
10274     sub EOF_UnicodeData {
10275         # Called upon EOF to flush the buffers, and create the Hangul
10276         # decomposition mappings if needed.
10277
10278         my $file = shift;
10279         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10280
10281         # Flush the buffers.
10282         foreach my $i (1 .. $last_field) {
10283             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10284         }
10285
10286         if (-e 'Jamo.txt') {
10287
10288             # The algorithm is published by Unicode, based on values in
10289             # Jamo.txt, (which should have been processed before this
10290             # subroutine), and the results left in %Jamo
10291             unless (%Jamo) {
10292                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
10293                 return;
10294             }
10295
10296             # If the full decomposition map table is being output, insert
10297             # into it the Hangul syllable mappings.  This is to avoid having
10298             # to publish a subroutine in it to compute them.  (which would
10299             # essentially be this code.)  This uses the algorithm published by
10300             # Unicode.
10301             if (property_ref('Decomposition_Mapping')->to_output_map) {
10302                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10303                     use integer;
10304                     my $SIndex = $S - $SBase;
10305                     my $L = $LBase + $SIndex / $NCount;
10306                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
10307                     my $T = $TBase + $SIndex % $TCount;
10308
10309                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10310                     my $decomposition = sprintf("%04X %04X", $L, $V);
10311                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10312                     $file->insert_adjusted_lines(
10313                                 sprintf("%04X; Decomposition_Mapping; %s",
10314                                         $S,
10315                                         $decomposition));
10316                 }
10317             }
10318         }
10319
10320         return;
10321     }
10322
10323     sub filter_v1_ucd {
10324         # Fix UCD lines in version 1.  This is probably overkill, but this
10325         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
10326         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
10327         #       removed.  This program retains them
10328         # 2)    didn't include ranges, which it should have, and which are now
10329         #       added in @corrected_lines below.  It was hand populated by
10330         #       taking the data from Version 2, verified by analyzing
10331         #       DAge.txt.
10332         # 3)    There is a syntax error in the entry for U+09F8 which could
10333         #       cause problems for utf8_heavy, and so is changed.  It's
10334         #       numeric value was simply a minus sign, without any number.
10335         #       (Eventually Unicode changed the code point to non-numeric.)
10336         # 4)    The decomposition types often don't match later versions
10337         #       exactly, and the whole syntax of that field is different; so
10338         #       the syntax is changed as well as the types to their later
10339         #       terminology.  Otherwise normalize.pm would be very unhappy
10340         # 5)    Many ccc classes are different.  These are left intact.
10341         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
10342         #       fields.  These are unchanged because it doesn't really cause
10343         #       problems for Perl.
10344         # 7)    A number of code points, such as controls, don't have their
10345         #       Unicode Version 1 Names in this file.  These are unchanged.
10346
10347         my @corrected_lines = split /\n/, <<'END';
10348 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10349 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10350 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10351 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10352 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10353 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10354 END
10355
10356         my $file = shift;
10357         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10358
10359         #local $to_trace = 1 if main::DEBUG;
10360         trace $_ if main::DEBUG && $to_trace;
10361
10362         # -1 => retain trailing null fields
10363         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10364
10365         # At the first place that is wrong in the input, insert all the
10366         # corrections, replacing the wrong line.
10367         if ($code_point eq '4E00') {
10368             my @copy = @corrected_lines;
10369             $_ = shift @copy;
10370             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10371
10372             $file->insert_lines(@copy);
10373         }
10374
10375
10376         if ($fields[$NUMERIC] eq '-') {
10377             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
10378         }
10379
10380         if  ($fields[$PERL_DECOMPOSITION] ne "") {
10381
10382             # Several entries have this change to superscript 2 or 3 in the
10383             # middle.  Convert these to the modern version, which is to use
10384             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10385             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10386             # 'HHHH HHHH 00B3 HHHH'.
10387             # It turns out that all of these that don't have another
10388             # decomposition defined at the beginning of the line have the
10389             # <square> decomposition in later releases.
10390             if ($code_point ne '00B2' && $code_point ne '00B3') {
10391                 if  ($fields[$PERL_DECOMPOSITION]
10392                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10393                 {
10394                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10395                         $fields[$PERL_DECOMPOSITION] = '<square> '
10396                         . $fields[$PERL_DECOMPOSITION];
10397                     }
10398                 }
10399             }
10400
10401             # If is like '<+circled> 0052 <-circled>', convert to
10402             # '<circled> 0052'
10403             $fields[$PERL_DECOMPOSITION] =~
10404                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10405
10406             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10407             $fields[$PERL_DECOMPOSITION] =~
10408                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10409             or $fields[$PERL_DECOMPOSITION] =~
10410                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10411             or $fields[$PERL_DECOMPOSITION] =~
10412                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10413             or $fields[$PERL_DECOMPOSITION] =~
10414                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10415
10416             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10417             $fields[$PERL_DECOMPOSITION] =~
10418                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10419
10420             # Change names to modern form.
10421             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10422             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10423             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10424             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10425
10426             # One entry has weird braces
10427             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10428         }
10429
10430         $_ = join ';', $code_point, @fields;
10431         trace $_ if main::DEBUG && $to_trace;
10432         return;
10433     }
10434
10435     sub filter_v2_1_5_ucd {
10436         # A dozen entries in this 2.1.5 file had the mirrored and numeric
10437         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
10438         # column appears to be N, swap it back.
10439
10440         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10441         if ($fields[$NUMERIC] eq 'N') {
10442             $fields[$NUMERIC] = $fields[$MIRRORED];
10443             $fields[$MIRRORED] = 'N';
10444             $_ = join ';', $code_point, @fields;
10445         }
10446         return;
10447     }
10448
10449     sub filter_v6_ucd {
10450
10451         # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10452         # accepted that yet to allow for some deprecation cycles.
10453
10454         return if $_ !~ /^(?:0007|1F514|070F);/;
10455
10456         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10457         if ($code_point eq '0007') {
10458             $fields[$CHARNAME] = "ALERT";
10459         }
10460         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10461                             # http://www.unicode.org/versions/corrigendum8.html
10462             $fields[$BIDI] = "AL";
10463         }
10464         elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
10465             $fields[$CHARNAME] = "";
10466         }
10467
10468         $_ = join ';', $code_point, @fields;
10469
10470         return;
10471     }
10472 } # End closure for UnicodeData
10473
10474 sub process_GCB_test {
10475
10476     my $file = shift;
10477     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10478
10479     while ($file->next_line) {
10480         push @backslash_X_tests, $_;
10481     }
10482
10483     return;
10484 }
10485
10486 sub process_NamedSequences {
10487     # NamedSequences.txt entries are just added to an array.  Because these
10488     # don't look like the other tables, they have their own handler.
10489     # An example:
10490     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10491     #
10492     # This just adds the sequence to an array for later handling
10493
10494     my $file = shift;
10495     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10496
10497     while ($file->next_line) {
10498         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10499         if (@remainder) {
10500             $file->carp_bad_line(
10501                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10502             next;
10503         }
10504
10505         # Note single \t in keeping with special output format of
10506         # Perl_charnames.  But it turns out that the code points don't have to
10507         # be 5 digits long, like the rest, based on the internal workings of
10508         # charnames.pm.  This could be easily changed for consistency.
10509         push @named_sequences, "$sequence\t$name";
10510     }
10511     return;
10512 }
10513
10514 { # Closure
10515
10516     my $first_range;
10517
10518     sub  filter_early_ea_lb {
10519         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
10520         # third field be the name of the code point, which can be ignored in
10521         # most cases.  But it can be meaningful if it marks a range:
10522         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10523         # 3400;W;<CJK Ideograph Extension A, First>
10524         #
10525         # We need to see the First in the example above to know it's a range.
10526         # They did not use the later range syntaxes.  This routine changes it
10527         # to use the modern syntax.
10528         # $1 is the Input_file object.
10529
10530         my @fields = split /\s*;\s*/;
10531         if ($fields[2] =~ /^<.*, First>/) {
10532             $first_range = $fields[0];
10533             $_ = "";
10534         }
10535         elsif ($fields[2] =~ /^<.*, Last>/) {
10536             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10537         }
10538         else {
10539             undef $first_range;
10540             $_ = "$fields[0]; $fields[1]";
10541         }
10542
10543         return;
10544     }
10545 }
10546
10547 sub filter_old_style_arabic_shaping {
10548     # Early versions used a different term for the later one.
10549
10550     my @fields = split /\s*;\s*/;
10551     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10552     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
10553     $_ = join ';', @fields;
10554     return;
10555 }
10556
10557 sub filter_arabic_shaping_line {
10558     # ArabicShaping.txt has entries that look like:
10559     # 062A; TEH; D; BEH
10560     # The field containing 'TEH' is not used.  The next field is Joining_Type
10561     # and the last is Joining_Group
10562     # This generates two lines to pass on, one for each property on the input
10563     # line.
10564
10565     my $file = shift;
10566     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10567
10568     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10569
10570     if (@fields > 4) {
10571         $file->carp_bad_line('Extra fields');
10572         $_ = "";
10573         return;
10574     }
10575
10576     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10577     $_ = "$fields[0]; Joining_Type; $fields[2]";
10578
10579     return;
10580 }
10581
10582 { # Closure
10583     my $lc; # Table for lowercase mapping
10584     my $tc;
10585     my $uc;
10586
10587     sub setup_special_casing {
10588         # SpecialCasing.txt contains the non-simple case change mappings.  The
10589         # simple ones are in UnicodeData.txt, which should already have been
10590         # read in to the full property data structures, so as to initialize
10591         # these with the simple ones.  Then the SpecialCasing.txt entries
10592         # overwrite the ones which have different full mappings.
10593
10594         # This routine sees if the simple mappings are to be output, and if
10595         # so, copies what has already been put into the full mapping tables,
10596         # while they still contain only the simple mappings.
10597
10598         # The reason it is done this way is that the simple mappings are
10599         # probably not going to be output, so it saves work to initialize the
10600         # full tables with the simple mappings, and then overwrite those
10601         # relatively few entries in them that have different full mappings,
10602         # and thus skip the simple mapping tables altogether.
10603
10604         # New tables with just the simple mappings that are overridden by the
10605         # full ones are constructed.  These are for Unicode::UCD, which
10606         # requires the simple mappings.  The Case_Folding table is a combined
10607         # table of both the simple and full mappings, with the full ones being
10608         # in the hash, and the simple ones, even those overridden by the hash,
10609         # being in the base table.  That same mechanism could have been
10610         # employed here, except that the docs have said that the generated
10611         # files are usuable directly by programs, so we dare not change the
10612         # format in any way.
10613
10614         my $file= shift;
10615         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10616
10617         $lc = property_ref('lc');
10618         $tc = property_ref('tc');
10619         $uc = property_ref('uc');
10620
10621         # For each of the case change mappings...
10622         foreach my $case_table ($lc, $tc, $uc) {
10623             my $case = $case_table->name;
10624             my $full = property_ref($case);
10625             unless (defined $full && ! $full->is_empty) {
10626                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
10627             }
10628
10629             # The simple version's name in each mapping merely has an 's' in
10630             # front of the full one's
10631             my $simple = property_ref('s' . $case);
10632             $simple->initialize($full) if $simple->to_output_map();
10633
10634             my $simple_only = Property->new("_s$case",
10635                     Type => $STRING,
10636                     Default_Map => $CODE_POINT,
10637                     Perl_Extension => 1,
10638                     Description => "The simple mappings for $case for code points that have full mappings as well");
10639             $simple_only->set_to_output_map($INTERNAL_MAP);
10640             $simple_only->add_comment(join_lines( <<END
10641 This file is for UCD.pm so that it can construct simple mappings that would
10642 otherwise be lost because they are overridden by full mappings.
10643 END
10644             ));
10645         }
10646
10647         return;
10648     }
10649
10650     sub filter_special_casing_line {
10651         # Change the format of $_ from SpecialCasing.txt into something that
10652         # the generic handler understands.  Each input line contains three
10653         # case mappings.  This will generate three lines to pass to the
10654         # generic handler for each of those.
10655
10656         # The input syntax (after stripping comments and trailing white space
10657         # is like one of the following (with the final two being entries that
10658         # we ignore):
10659         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10660         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10661         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10662         # Note the trailing semi-colon, unlike many of the input files.  That
10663         # means that there will be an extra null field generated by the split
10664
10665         my $file = shift;
10666         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10667
10668         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10669                                               # fields
10670
10671         # field #4 is when this mapping is conditional.  If any of these get
10672         # implemented, it would be by hard-coding in the casing functions in
10673         # the Perl core, not through tables.  But if there is a new condition
10674         # we don't know about, output a warning.  We know about all the
10675         # conditions through 6.0
10676         if ($fields[4] ne "") {
10677             my @conditions = split ' ', $fields[4];
10678             if ($conditions[0] ne 'tr'  # We know that these languages have
10679                                         # conditions, and some are multiple
10680                 && $conditions[0] ne 'az'
10681                 && $conditions[0] ne 'lt'
10682
10683                 # And, we know about a single condition Final_Sigma, but
10684                 # nothing else.
10685                 && ($v_version gt v5.2.0
10686                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10687             {
10688                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10689             }
10690             elsif ($conditions[0] ne 'Final_Sigma') {
10691
10692                     # Don't print out a message for Final_Sigma, because we
10693                     # have hard-coded handling for it.  (But the standard
10694                     # could change what the rule should be, but it wouldn't
10695                     # show up here anyway.
10696
10697                     print "# SKIPPING Special Casing: $_\n"
10698                                                     if $verbosity >= $VERBOSE;
10699             }
10700             $_ = "";
10701             return;
10702         }
10703         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10704             $file->carp_bad_line('Extra fields');
10705             $_ = "";
10706             return;
10707         }
10708
10709         $_ = "$fields[0]; lc; $fields[1]";
10710         $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10711         $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
10712
10713         # Copy any simple case change to the special tables constructed if
10714         # being overridden by a multi-character case change.
10715         if ($fields[1] ne $fields[0]
10716             && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10717         {
10718             $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10719         }
10720         if ($fields[2] ne $fields[0]
10721             && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10722         {
10723             $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10724         }
10725         if ($fields[3] ne $fields[0]
10726             && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10727         {
10728             $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10729         }
10730
10731         return;
10732     }
10733 }
10734
10735 sub filter_old_style_case_folding {
10736     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
10737     # and later style.  Different letters were used in the earlier.
10738
10739     my $file = shift;
10740     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10741
10742     my @fields = split /\s*;\s*/;
10743     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10744         $fields[1] = 'I';
10745     }
10746     elsif ($fields[1] eq 'L') {
10747         $fields[1] = 'C';             # L => C always
10748     }
10749     elsif ($fields[1] eq 'E') {
10750         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
10751             $fields[1] = 'F'
10752         }
10753         else {
10754             $fields[1] = 'C'
10755         }
10756     }
10757     else {
10758         $file->carp_bad_line("Expecting L or E in second field");
10759         $_ = "";
10760         return;
10761     }
10762     $_ = join("; ", @fields) . ';';
10763     return;
10764 }
10765
10766 { # Closure for case folding
10767
10768     # Create the map for simple only if are going to output it, for otherwise
10769     # it takes no part in anything we do.
10770     my $to_output_simple;
10771
10772     sub setup_case_folding($) {
10773         # Read in the case foldings in CaseFolding.txt.  This handles both
10774         # simple and full case folding.
10775
10776         $to_output_simple
10777                         = property_ref('Simple_Case_Folding')->to_output_map;
10778
10779         return;
10780     }
10781
10782     sub filter_case_folding_line {
10783         # Called for each line in CaseFolding.txt
10784         # Input lines look like:
10785         # 0041; C; 0061; # LATIN CAPITAL LETTER A
10786         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10787         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10788         #
10789         # 'C' means that folding is the same for both simple and full
10790         # 'F' that it is only for full folding
10791         # 'S' that it is only for simple folding
10792         # 'T' is locale-dependent, and ignored
10793         # 'I' is a type of 'F' used in some early releases.
10794         # Note the trailing semi-colon, unlike many of the input files.  That
10795         # means that there will be an extra null field generated by the split
10796         # below, which we ignore and hence is not an error.
10797
10798         my $file = shift;
10799         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10800
10801         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10802         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10803             $file->carp_bad_line('Extra fields');
10804             $_ = "";
10805             return;
10806         }
10807
10808         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
10809             $_ = "";
10810             return;
10811         }
10812
10813         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10814         # I are all full foldings; S is single-char.  For S, there is always
10815         # an F entry, so we must allow multiple values for the same code
10816         # point.  Fortunately this table doesn't need further manipulation
10817         # which would preclude using multiple-values.  The S is now included
10818         # so that _swash_inversion_hash() is able to construct closures
10819         # without having to worry about F mappings.
10820         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10821             $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
10822         }
10823         else {
10824             $_ = "";
10825             $file->carp_bad_line('Expecting C F I S or T in second field');
10826         }
10827
10828         # C and S are simple foldings, but simple case folding is not needed
10829         # unless we explicitly want its map table output.
10830         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10831             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10832         }
10833
10834         return;
10835     }
10836
10837 } # End case fold closure
10838
10839 sub filter_jamo_line {
10840     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
10841     # from this file that is used in generating the Name property for Jamo
10842     # code points.  But, it also is used to convert early versions' syntax
10843     # into the modern form.  Here are two examples:
10844     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
10845     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
10846     #
10847     # The input is $_, the output is $_ filtered.
10848
10849     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
10850
10851     # Let the caller handle unexpected input.  In earlier versions, there was
10852     # a third field which is supposed to be a comment, but did not have a '#'
10853     # before it.
10854     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10855
10856     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
10857                                 # beginning.
10858
10859     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
10860     $fields[1] = 'R' if $fields[0] eq '1105';
10861
10862     # Add to structure so can generate Names from it.
10863     my $cp = hex $fields[0];
10864     my $short_name = $fields[1];
10865     $Jamo{$cp} = $short_name;
10866     if ($cp <= $LBase + $LCount) {
10867         $Jamo_L{$short_name} = $cp - $LBase;
10868     }
10869     elsif ($cp <= $VBase + $VCount) {
10870         $Jamo_V{$short_name} = $cp - $VBase;
10871     }
10872     elsif ($cp <= $TBase + $TCount) {
10873         $Jamo_T{$short_name} = $cp - $TBase;
10874     }
10875     else {
10876         Carp::my_carp_bug("Unexpected Jamo code point in $_");
10877     }
10878
10879
10880     # Reassemble using just the first two fields to look like a typical
10881     # property file line
10882     $_ = "$fields[0]; $fields[1]";
10883
10884     return;
10885 }
10886
10887 sub register_fraction($) {
10888     # This registers the input rational number so that it can be passed on to
10889     # utf8_heavy.pl, both in rational and floating forms.
10890
10891     my $rational = shift;
10892
10893     my $float = eval $rational;
10894     $nv_floating_to_rational{$float} = $rational;
10895     return;
10896 }
10897
10898 sub filter_numeric_value_line {
10899     # DNumValues contains lines of a different syntax than the typical
10900     # property file:
10901     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
10902     #
10903     # This routine transforms $_ containing the anomalous syntax to the
10904     # typical, by filtering out the extra columns, and convert early version
10905     # decimal numbers to strings that look like rational numbers.
10906
10907     my $file = shift;
10908     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10909
10910     # Starting in 5.1, there is a rational field.  Just use that, omitting the
10911     # extra columns.  Otherwise convert the decimal number in the second field
10912     # to a rational, and omit extraneous columns.
10913     my @fields = split /\s*;\s*/, $_, -1;
10914     my $rational;
10915
10916     if ($v_version ge v5.1.0) {
10917         if (@fields != 4) {
10918             $file->carp_bad_line('Not 4 semi-colon separated fields');
10919             $_ = "";
10920             return;
10921         }
10922         $rational = $fields[3];
10923         $_ = join '; ', @fields[ 0, 3 ];
10924     }
10925     else {
10926
10927         # Here, is an older Unicode file, which has decimal numbers instead of
10928         # rationals in it.  Use the fraction to calculate the denominator and
10929         # convert to rational.
10930
10931         if (@fields != 2 && @fields != 3) {
10932             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10933             $_ = "";
10934             return;
10935         }
10936
10937         my $codepoints = $fields[0];
10938         my $decimal = $fields[1];
10939         if ($decimal =~ s/\.0+$//) {
10940
10941             # Anything ending with a decimal followed by nothing but 0's is an
10942             # integer
10943             $_ = "$codepoints; $decimal";
10944             $rational = $decimal;
10945         }
10946         else {
10947
10948             my $denominator;
10949             if ($decimal =~ /\.50*$/) {
10950                 $denominator = 2;
10951             }
10952
10953             # Here have the hardcoded repeating decimals in the fraction, and
10954             # the denominator they imply.  There were only a few denominators
10955             # in the older Unicode versions of this file which this code
10956             # handles, so it is easy to convert them.
10957
10958             # The 4 is because of a round-off error in the Unicode 3.2 files
10959             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10960                 $denominator = 3;
10961             }
10962             elsif ($decimal =~ /\.[27]50*$/) {
10963                 $denominator = 4;
10964             }
10965             elsif ($decimal =~ /\.[2468]0*$/) {
10966                 $denominator = 5;
10967             }
10968             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10969                 $denominator = 6;
10970             }
10971             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10972                 $denominator = 8;
10973             }
10974             if ($denominator) {
10975                 my $sign = ($decimal < 0) ? "-" : "";
10976                 my $numerator = int((abs($decimal) * $denominator) + .5);
10977                 $rational = "$sign$numerator/$denominator";
10978                 $_ = "$codepoints; $rational";
10979             }
10980             else {
10981                 $file->carp_bad_line("Can't cope with number '$decimal'.");
10982                 $_ = "";
10983                 return;
10984             }
10985         }
10986     }
10987
10988     register_fraction($rational) if $rational =~ qr{/};
10989     return;
10990 }
10991
10992 { # Closure
10993     my %unihan_properties;
10994     my $iicore;
10995
10996
10997     sub setup_unihan {
10998         # Do any special setup for Unihan properties.
10999
11000         # This property gives the wrong computed type, so override.
11001         my $usource = property_ref('kIRG_USource');
11002         $usource->set_type($STRING) if defined $usource;
11003
11004         # This property is to be considered binary (it says so in
11005         # http://www.unicode.org/reports/tr38/)
11006         $iicore = property_ref('kIICore');
11007         if (defined $iicore) {
11008             $iicore->set_type($BINARY);
11009
11010             # We have to change the default map, because the @missing line is
11011             # misleading, given that we are treating it as binary.
11012             $iicore->set_default_map('N');
11013             $iicore->table("Y")
11014                 ->add_note("Converted to a binary property as per unicode.org UAX #38.");
11015         }
11016
11017         return;
11018     }
11019
11020     sub filter_unihan_line {
11021         # Change unihan db lines to look like the others in the db.  Here is
11022         # an input sample:
11023         #   U+341C        kCangjie        IEKN
11024
11025         # Tabs are used instead of semi-colons to separate fields; therefore
11026         # they may have semi-colons embedded in them.  Change these to periods
11027         # so won't screw up the rest of the code.
11028         s/;/./g;
11029
11030         # Remove lines that don't look like ones we accept.
11031         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11032             $_ = "";
11033             return;
11034         }
11035
11036         # Extract the property, and save a reference to its object.
11037         my $property = $1;
11038         if (! exists $unihan_properties{$property}) {
11039             $unihan_properties{$property} = property_ref($property);
11040         }
11041
11042         # Don't do anything unless the property is one we're handling, which
11043         # we determine by seeing if there is an object defined for it or not
11044         if (! defined $unihan_properties{$property}) {
11045             $_ = "";
11046             return;
11047         }
11048
11049         # The iicore property is supposed to be a boolean, so convert to our
11050         # standard boolean form.
11051         if (defined $iicore && $unihan_properties{$property} == $iicore) {
11052             $_ =~ s/$property.*/$property\tY/
11053         }
11054
11055         # Convert the tab separators to our standard semi-colons, and convert
11056         # the U+HHHH notation to the rest of the standard's HHHH
11057         s/\t/;/g;
11058         s/\b U \+ (?= $code_point_re )//xg;
11059
11060         #local $to_trace = 1 if main::DEBUG;
11061         trace $_ if main::DEBUG && $to_trace;
11062
11063         return;
11064     }
11065 }
11066
11067 sub filter_blocks_lines {
11068     # In the Blocks.txt file, the names of the blocks don't quite match the
11069     # names given in PropertyValueAliases.txt, so this changes them so they
11070     # do match:  Blanks and hyphens are changed into underscores.  Also makes
11071     # early release versions look like later ones
11072     #
11073     # $_ is transformed to the correct value.
11074
11075     my $file = shift;
11076         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11077
11078     if ($v_version lt v3.2.0) {
11079         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11080             $_ = "";
11081             return;
11082         }
11083
11084         # Old versions used a different syntax to mark the range.
11085         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11086     }
11087
11088     my @fields = split /\s*;\s*/, $_, -1;
11089     if (@fields != 2) {
11090         $file->carp_bad_line("Expecting exactly two fields");
11091         $_ = "";
11092         return;
11093     }
11094
11095     # Change hyphens and blanks in the block name field only
11096     $fields[1] =~ s/[ -]/_/g;
11097     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
11098
11099     $_ = join("; ", @fields);
11100     return;
11101 }
11102
11103 { # Closure
11104     my $current_property;
11105
11106     sub filter_old_style_proplist {
11107         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
11108         # was in a completely different syntax.  Ken Whistler of Unicode says
11109         # that it was something he used as an aid for his own purposes, but
11110         # was never an official part of the standard.  However, comments in
11111         # DAge.txt indicate that non-character code points were available in
11112         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
11113         # there except through this file (but on the other hand, they first
11114         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11115         # not.  But the claim is that it was published as an aid to others who
11116         # might want some more information than was given in the official UCD
11117         # of the time.  Many of the properties in it were incorporated into
11118         # the later PropList.txt, but some were not.  This program uses this
11119         # early file to generate property tables that are otherwise not
11120         # accessible in the early UCD's, and most were probably not really
11121         # official at that time, so one could argue that it should be ignored,
11122         # and you can easily modify things to skip this.  And there are bugs
11123         # in this file in various versions.  (For example, the 2.1.9 version
11124         # removes from Alphabetic the CJK range starting at 4E00, and they
11125         # weren't added back in until 3.1.0.)  Many of this file's properties
11126         # were later sanctioned, so this code generates tables for those
11127         # properties that aren't otherwise in the UCD of the time but
11128         # eventually did become official, and throws away the rest.  Here is a
11129         # list of all the ones that are thrown away:
11130         #   Bidi=*                       duplicates UnicodeData.txt
11131         #   Combining                    never made into official property;
11132         #                                is \P{ccc=0}
11133         #   Composite                    never made into official property.
11134         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
11135         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
11136         #   Delimiter                    never made into official property;
11137         #                                removed in 3.0.1
11138         #   Format Control               never made into official property;
11139         #                                similar to gc=cf
11140         #   High Surrogate               duplicates Blocks.txt
11141         #   Ignorable Control            never made into official property;
11142         #                                similar to di=y
11143         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
11144         #   Left of Pair                 never made into official property;
11145         #   Line Separator               duplicates UnicodeData.txt: gc=zl
11146         #   Low Surrogate                duplicates Blocks.txt
11147         #   Non-break                    was actually listed as a property
11148         #                                in 3.2, but without any code
11149         #                                points.  Unicode denies that this
11150         #                                was ever an official property
11151         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
11152         #   Numeric                      duplicates UnicodeData.txt: gc=cc
11153         #   Paired Punctuation           never made into official property;
11154         #                                appears to be gc=ps + gc=pe
11155         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
11156         #   Private Use                  duplicates UnicodeData.txt: gc=co
11157         #   Private Use High Surrogate   duplicates Blocks.txt
11158         #   Punctuation                  duplicates UnicodeData.txt: gc=p
11159         #   Space                        different definition than eventual
11160         #                                one.
11161         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
11162         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
11163         #   Zero-width                   never made into official property;
11164         #                                subset of gc=cf
11165         # Most of the properties have the same names in this file as in later
11166         # versions, but a couple do not.
11167         #
11168         # This subroutine filters $_, converting it from the old style into
11169         # the new style.  Here's a sample of the old-style
11170         #
11171         #   *******************************************
11172         #
11173         #   Property dump for: 0x100000A0 (Join Control)
11174         #
11175         #   200C..200D  (2 chars)
11176         #
11177         # In the example, the property is "Join Control".  It is kept in this
11178         # closure between calls to the subroutine.  The numbers beginning with
11179         # 0x were internal to Ken's program that generated this file.
11180
11181         # If this line contains the property name, extract it.
11182         if (/^Property dump for: [^(]*\((.*)\)/) {
11183             $_ = $1;
11184
11185             # Convert white space to underscores.
11186             s/ /_/g;
11187
11188             # Convert the few properties that don't have the same name as
11189             # their modern counterparts
11190             s/Identifier_Part/ID_Continue/
11191             or s/Not_a_Character/NChar/;
11192
11193             # If the name matches an existing property, use it.
11194             if (defined property_ref($_)) {
11195                 trace "new property=", $_ if main::DEBUG && $to_trace;
11196                 $current_property = $_;
11197             }
11198             else {        # Otherwise discard it
11199                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11200                 undef $current_property;
11201             }
11202             $_ = "";    # The property is saved for the next lines of the
11203                         # file, but this defining line is of no further use,
11204                         # so clear it so that the caller won't process it
11205                         # further.
11206         }
11207         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11208
11209             # Here, the input line isn't a header defining a property for the
11210             # following section, and either we aren't in such a section, or
11211             # the line doesn't look like one that defines the code points in
11212             # such a section.  Ignore this line.
11213             $_ = "";
11214         }
11215         else {
11216
11217             # Here, we have a line defining the code points for the current
11218             # stashed property.  Anything starting with the first blank is
11219             # extraneous.  Otherwise, it should look like a normal range to
11220             # the caller.  Append the property name so that it looks just like
11221             # a modern PropList entry.
11222
11223             $_ =~ s/\s.*//;
11224             $_ .= "; $current_property";
11225         }
11226         trace $_ if main::DEBUG && $to_trace;
11227         return;
11228     }
11229 } # End closure for old style proplist
11230
11231 sub filter_old_style_normalization_lines {
11232     # For early releases of Unicode, the lines were like:
11233     #        74..2A76    ; NFKD_NO
11234     # For later releases this became:
11235     #        74..2A76    ; NFKD_QC; N
11236     # Filter $_ to look like those in later releases.
11237     # Similarly for MAYBEs
11238
11239     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11240
11241     # Also, the property FC_NFKC was abbreviated to FNC
11242     s/FNC/FC_NFKC/;
11243     return;
11244 }
11245
11246 sub setup_script_extensions {
11247     # The Script_Extensions property starts out with a clone of the Script
11248     # property.
11249
11250     my $sc = property_ref("Script");
11251     my $scx = Property->new("scx", Full_Name => "Script_Extensions",
11252                   Initialize => $sc,
11253                   Default_Map => $sc->default_map,
11254                   Pre_Declared_Maps => 0,
11255                   );
11256     $scx->add_comment(join_lines( <<END
11257 The values for code points that appear in one script are just the same as for
11258 the 'Script' property.  Likewise the values for those that appear in many
11259 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
11260 values of code points that appear in a few scripts are a space separated list
11261 of those scripts.
11262 END
11263     ));
11264
11265     # Make the scx's tables and aliases for them the same as sc's
11266     foreach my $table ($sc->tables) {
11267         my $scx_table = $scx->add_match_table($table->name,
11268                                 Full_Name => $table->full_name);
11269         foreach my $alias ($table->aliases) {
11270             $scx_table->add_alias($alias->name);
11271         }
11272     }
11273 }
11274
11275 sub finish_Unicode() {
11276     # This routine should be called after all the Unicode files have been read
11277     # in.  It:
11278     # 1) Adds the mappings for code points missing from the files which have
11279     #    defaults specified for them.
11280     # 2) At this this point all mappings are known, so it computes the type of
11281     #    each property whose type hasn't been determined yet.
11282     # 3) Calculates all the regular expression match tables based on the
11283     #    mappings.
11284     # 3) Calculates and adds the tables which are defined by Unicode, but
11285     #    which aren't derived by them
11286
11287     # For each property, fill in any missing mappings, and calculate the re
11288     # match tables.  If a property has more than one missing mapping, the
11289     # default is a reference to a data structure, and requires data from other
11290     # properties to resolve.  The sort is used to cause these to be processed
11291     # last, after all the other properties have been calculated.
11292     # (Fortunately, the missing properties so far don't depend on each other.)
11293     foreach my $property
11294         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11295         property_ref('*'))
11296     {
11297         # $perl has been defined, but isn't one of the Unicode properties that
11298         # need to be finished up.
11299         next if $property == $perl;
11300
11301         # Handle the properties that have more than one possible default
11302         if (ref $property->default_map) {
11303             my $default_map = $property->default_map;
11304
11305             # These properties have stored in the default_map:
11306             # One or more of:
11307             #   1)  A default map which applies to all code points in a
11308             #       certain class
11309             #   2)  an expression which will evaluate to the list of code
11310             #       points in that class
11311             # And
11312             #   3) the default map which applies to every other missing code
11313             #      point.
11314             #
11315             # Go through each list.
11316             while (my ($default, $eval) = $default_map->get_next_defaults) {
11317
11318                 # Get the class list, and intersect it with all the so-far
11319                 # unspecified code points yielding all the code points
11320                 # in the class that haven't been specified.
11321                 my $list = eval $eval;
11322                 if ($@) {
11323                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11324                     last;
11325                 }
11326
11327                 # Narrow down the list to just those code points we don't have
11328                 # maps for yet.
11329                 $list = $list & $property->inverse_list;
11330
11331                 # Add mappings to the property for each code point in the list
11332                 foreach my $range ($list->ranges) {
11333                     $property->add_map($range->start, $range->end, $default,
11334                     Replace => $CROAK);
11335                 }
11336             }
11337
11338             # All remaining code points have the other mapping.  Set that up
11339             # so the normal single-default mapping code will work on them
11340             $property->set_default_map($default_map->other_default);
11341
11342             # And fall through to do that
11343         }
11344
11345         # We should have enough data now to compute the type of the property.
11346         $property->compute_type;
11347         my $property_type = $property->type;
11348
11349         next if ! $property->to_create_match_tables;
11350
11351         # Here want to create match tables for this property
11352
11353         # The Unicode db always (so far, and they claim into the future) have
11354         # the default for missing entries in binary properties be 'N' (unless
11355         # there is a '@missing' line that specifies otherwise)
11356         if ($property_type == $BINARY && ! defined $property->default_map) {
11357             $property->set_default_map('N');
11358         }
11359
11360         # Add any remaining code points to the mapping, using the default for
11361         # missing code points.
11362         if (defined (my $default_map = $property->default_map)) {
11363
11364             # Make sure there is a match table for the default
11365             my $default_table;
11366             if (! defined ($default_table = $property->table($default_map))) {
11367                 $default_table = $property->add_match_table($default_map);
11368             }
11369
11370             # And, if the property is binary, the default table will just
11371             # be the complement of the other table.
11372             if ($property_type == $BINARY) {
11373                 my $non_default_table;
11374
11375                 # Find the non-default table.
11376                 for my $table ($property->tables) {
11377                     next if $table == $default_table;
11378                     $non_default_table = $table;
11379                 }
11380                 $default_table->set_complement($non_default_table);
11381             }
11382
11383             # This fills in any missing values with the default.  It's
11384             # tempting to save some time and memory in running this program
11385             # by skipping this step for binary tables where the default
11386             # is easily calculated.  But it is needed for generating
11387             # the test file, and other changes would also be required to do
11388             # so.
11389             $property->add_map(0, $LAST_UNICODE_CODEPOINT,
11390                                $default_map, Replace => $NO);
11391         }
11392
11393         # Have all we need to populate the match tables.
11394         my $property_name = $property->name;
11395         my $maps_should_be_defined = $property->pre_declared_maps;
11396         foreach my $range ($property->ranges) {
11397             my $map = $range->value;
11398             my $table = property_ref($property_name)->table($map);
11399             if (! defined $table) {
11400
11401                 # Integral and rational property values are not necessarily
11402                 # defined in PropValueAliases, but whether all the other ones
11403                 # should be depends on the property.
11404                 if ($maps_should_be_defined
11405                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11406                 {
11407                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
11408                 }
11409                 $table = property_ref($property_name)->add_match_table($map);
11410             }
11411
11412             $table->add_range($range->start, $range->end);
11413         }
11414
11415         # For Perl 5.6 compatibility, all properties matchable in regexes can
11416         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
11417         # But warn if this creates a conflict with a (new) Unicode property
11418         # name, although it appears that Unicode has made a decision never to
11419         # begin a property name with 'Is_', so this shouldn't happen.
11420         foreach my $alias ($property->aliases) {
11421             my $Is_name = 'Is_' . $alias->name;
11422             if (defined (my $pre_existing = property_ref($Is_name))) {
11423                 Carp::my_carp(<<END
11424 There is already an alias named $Is_name (from " . $pre_existing . "), so
11425 creating one for $property won't work.  This is bad news.  If it is not too
11426 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
11427 from the git blame log for this area of the code that suppressed individual
11428 aliases that conflict with the new Unicode names.  Proceeding anyway.
11429 END
11430                 );
11431             }
11432         } # End of loop through aliases for this property
11433     } # End of loop through all Unicode properties.
11434
11435     # Fill in the mappings that Unicode doesn't completely furnish.  First the
11436     # single letter major general categories.  If Unicode were to start
11437     # delivering the values, this would be redundant, but better that than to
11438     # try to figure out if should skip and not get it right.  Ths could happen
11439     # if a new major category were to be introduced, and the hard-coded test
11440     # wouldn't know about it.
11441     # This routine depends on the standard names for the general categories
11442     # being what it thinks they are, like 'Cn'.  The major categories are the
11443     # union of all the general category tables which have the same first
11444     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11445     foreach my $minor_table ($gc->tables) {
11446         my $minor_name = $minor_table->name;
11447         next if length $minor_name == 1;
11448         if (length $minor_name != 2) {
11449             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
11450             next;
11451         }
11452
11453         my $major_name = uc(substr($minor_name, 0, 1));
11454         my $major_table = $gc->table($major_name);
11455         $major_table += $minor_table;
11456     }
11457
11458     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
11459     # defines it as LC)
11460     my $LC = $gc->table('LC');
11461     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
11462     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
11463
11464
11465     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11466                          # deliver the correct values in it
11467         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11468
11469         # Lt not in release 1.
11470         if (defined $gc->table('Lt')) {
11471             $LC += $gc->table('Lt');
11472             $gc->table('Lt')->set_caseless_equivalent($LC);
11473         }
11474     }
11475     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11476
11477     $gc->table('Ll')->set_caseless_equivalent($LC);
11478     $gc->table('Lu')->set_caseless_equivalent($LC);
11479
11480     my $Cs = $gc->table('Cs');
11481
11482
11483     # Folding information was introduced later into Unicode data.  To get
11484     # Perl's case ignore (/i) to work at all in releases that don't have
11485     # folding, use the best available alternative, which is lower casing.
11486     my $fold = property_ref('Simple_Case_Folding');
11487     if ($fold->is_empty) {
11488         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11489         $fold->add_note(join_lines(<<END
11490 WARNING: This table uses lower case as a substitute for missing fold
11491 information
11492 END
11493         ));
11494     }
11495
11496     # Multiple-character mapping was introduced later into Unicode data.  If
11497     # missing, use the single-characters maps as best available alternative
11498     foreach my $map (qw {   Uppercase_Mapping
11499                             Lowercase_Mapping
11500                             Titlecase_Mapping
11501                             Case_Folding
11502                         } ) {
11503         my $full = property_ref($map);
11504         if ($full->is_empty) {
11505             my $simple = property_ref('Simple_' . $map);
11506             $full->initialize($simple);
11507             $full->add_comment($simple->comment) if ($simple->comment);
11508             $full->add_note(join_lines(<<END
11509 WARNING: This table uses simple mapping (single-character only) as a
11510 substitute for missing multiple-character information
11511 END
11512             ));
11513         }
11514     }
11515
11516     # The Script_Extensions property started out as a clone of the Script
11517     # property.  But processing its data file caused some elements to be
11518     # replaced with different data.  (These elements were for the Common and
11519     # Inherited properties.)  This data is a qw() list of all the scripts that
11520     # the code points in the given range are in.  An example line is:
11521     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
11522     #
11523     # The code above has created a new match table named "Arab Syrc Thaa"
11524     # which contains 060C.  (The cloned table started out with this code point
11525     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
11526     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
11527     # match table.  This is repeated for all these tables and ranges.  The map
11528     # data is retained in the map table for reference, but the spurious match
11529     # tables are deleted.
11530
11531     my $scx = property_ref("Script_Extensions");
11532     foreach my $table ($scx->tables) {
11533         next unless $table->name =~ /\s/;   # All the new and only the new
11534                                             # tables have a space in their
11535                                             # names
11536         my @scripts = split /\s+/, $table->name;
11537         foreach my $script (@scripts) {
11538             my $script_table = $scx->table($script);
11539             $script_table += $table;
11540         }
11541         $scx->delete_match_table($table);
11542     }
11543
11544     return;
11545 }
11546
11547 sub compile_perl() {
11548     # Create perl-defined tables.  Almost all are part of the pseudo-property
11549     # named 'perl' internally to this program.  Many of these are recommended
11550     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11551     # on those found there.
11552     # Almost all of these are equivalent to some Unicode property.
11553     # A number of these properties have equivalents restricted to the ASCII
11554     # range, with their names prefaced by 'Posix', to signify that these match
11555     # what the Posix standard says they should match.  A couple are
11556     # effectively this, but the name doesn't have 'Posix' in it because there
11557     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
11558     # to the full Unicode range, by our guesses as to what is appropriate.
11559
11560     # 'Any' is all code points.  As an error check, instead of just setting it
11561     # to be that, construct it to be the union of all the major categories
11562     $Any = $perl->add_match_table('Any',
11563             Description  => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
11564             Matches_All => 1);
11565
11566     foreach my $major_table ($gc->tables) {
11567
11568         # Major categories are the ones with single letter names.
11569         next if length($major_table->name) != 1;
11570
11571         $Any += $major_table;
11572     }
11573
11574     if ($Any->max != $LAST_UNICODE_CODEPOINT) {
11575         Carp::my_carp_bug("Generated highest code point ("
11576            . sprintf("%X", $Any->max)
11577            . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
11578     }
11579     if ($Any->range_count != 1 || $Any->min != 0) {
11580      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11581     }
11582
11583     $Any->add_alias('All');
11584
11585     # Assigned is the opposite of gc=unassigned
11586     my $Assigned = $perl->add_match_table('Assigned',
11587                                 Description  => "All assigned code points",
11588                                 Initialize => ~ $gc->table('Unassigned'),
11589                                 );
11590
11591     # Our internal-only property should be treated as more than just a
11592     # synonym.
11593     $perl->add_match_table('_CombAbove')
11594             ->set_equivalent_to(property_ref('ccc')->table('Above'),
11595                                                                 Related => 1);
11596
11597     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11598     if (defined $block) {   # This is equivalent to the block if have it.
11599         my $Unicode_ASCII = $block->table('Basic_Latin');
11600         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11601             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11602         }
11603     }
11604
11605     # Very early releases didn't have blocks, so initialize ASCII ourselves if
11606     # necessary
11607     if ($ASCII->is_empty) {
11608         $ASCII->initialize([ 0..127 ]);
11609     }
11610
11611     # Get the best available case definitions.  Early Unicode versions didn't
11612     # have Uppercase and Lowercase defined, so use the general category
11613     # instead for them.
11614     my $Lower = $perl->add_match_table('Lower');
11615     my $Unicode_Lower = property_ref('Lowercase');
11616     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11617         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
11618         $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11619         $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11620         $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11621
11622     }
11623     else {
11624         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11625                                                                 Related => 1);
11626     }
11627     $Lower->add_alias('XPosixLower');
11628     my $Posix_Lower = $perl->add_match_table("PosixLower",
11629                             Description => "[a-z]",
11630                             Initialize => $Lower & $ASCII,
11631                             );
11632
11633     my $Upper = $perl->add_match_table('Upper');
11634     my $Unicode_Upper = property_ref('Uppercase');
11635     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11636         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
11637         $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11638         $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11639         $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11640     }
11641     else {
11642         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11643                                                                 Related => 1);
11644     }
11645     $Upper->add_alias('XPosixUpper');
11646     my $Posix_Upper = $perl->add_match_table("PosixUpper",
11647                             Description => "[A-Z]",
11648                             Initialize => $Upper & $ASCII,
11649                             );
11650
11651     # Earliest releases didn't have title case.  Initialize it to empty if not
11652     # otherwise present
11653     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11654                                        Description => '(= \p{Gc=Lt})');
11655     my $lt = $gc->table('Lt');
11656
11657     # Earlier versions of mktables had this related to $lt since they have
11658     # identical code points, but their caseless equivalents are not the same,
11659     # one being 'Cased' and the other being 'LC', and so now must be kept as
11660     # separate entities.
11661     $Title += $lt if defined $lt;
11662
11663     # If this Unicode version doesn't have Cased, set up our own.  From
11664     # Unicode 5.1: Definition D120: A character C is defined to be cased if
11665     # and only if C has the Lowercase or Uppercase property or has a
11666     # General_Category value of Titlecase_Letter.
11667     my $Unicode_Cased = property_ref('Cased');
11668     unless (defined $Unicode_Cased) {
11669         my $cased = $perl->add_match_table('Cased',
11670                         Initialize => $Lower + $Upper + $Title,
11671                         Description => 'Uppercase or Lowercase or Titlecase',
11672                         );
11673         $Unicode_Cased = $cased;
11674     }
11675     $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
11676
11677     # Similarly, set up our own Case_Ignorable property if this Unicode
11678     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
11679     # C is defined to be case-ignorable if C has the value MidLetter or the
11680     # value MidNumLet for the Word_Break property or its General_Category is
11681     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11682     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11683
11684     # Perl has long had an internal-only alias for this property.
11685     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
11686     my $case_ignorable = property_ref('Case_Ignorable');
11687     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11688         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11689                                                                 Related => 1);
11690     }
11691     else {
11692
11693         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11694
11695         # The following three properties are not in early releases
11696         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11697         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11698         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11699
11700         # For versions 4.1 - 5.0, there is no MidNumLet property, and
11701         # correspondingly the case-ignorable definition lacks that one.  For
11702         # 4.0, it appears that it was meant to be the same definition, but was
11703         # inadvertently omitted from the standard's text, so add it if the
11704         # property actually is there
11705         my $wb = property_ref('Word_Break');
11706         if (defined $wb) {
11707             my $midlet = $wb->table('MidLetter');
11708             $perl_case_ignorable += $midlet if defined $midlet;
11709             my $midnumlet = $wb->table('MidNumLet');
11710             $perl_case_ignorable += $midnumlet if defined $midnumlet;
11711         }
11712         else {
11713
11714             # In earlier versions of the standard, instead of the above two
11715             # properties , just the following characters were used:
11716             $perl_case_ignorable +=  0x0027  # APOSTROPHE
11717                                 +   0x00AD  # SOFT HYPHEN (SHY)
11718                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
11719         }
11720     }
11721
11722     # The remaining perl defined tables are mostly based on Unicode TR 18,
11723     # "Annex C: Compatibility Properties".  All of these have two versions,
11724     # one whose name generally begins with Posix that is posix-compliant, and
11725     # one that matches Unicode characters beyond the Posix, ASCII range
11726
11727     my $Alpha = $perl->add_match_table('Alpha');
11728
11729     # Alphabetic was not present in early releases
11730     my $Alphabetic = property_ref('Alphabetic');
11731     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11732         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11733     }
11734     else {
11735
11736         # For early releases, we don't get it exactly right.  The below
11737         # includes more than it should, which in 5.2 terms is: L + Nl +
11738         # Other_Alphabetic.  Other_Alphabetic contains many characters from
11739         # Mn and Mc.  It's better to match more than we should, than less than
11740         # we should.
11741         $Alpha->initialize($gc->table('Letter')
11742                             + $gc->table('Mn')
11743                             + $gc->table('Mc'));
11744         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
11745         $Alpha->add_description('Alphabetic');
11746     }
11747     $Alpha->add_alias('XPosixAlpha');
11748     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
11749                             Description => "[A-Za-z]",
11750                             Initialize => $Alpha & $ASCII,
11751                             );
11752     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11753     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
11754
11755     my $Alnum = $perl->add_match_table('Alnum',
11756                         Description => 'Alphabetic and (Decimal) Numeric',
11757                         Initialize => $Alpha + $gc->table('Decimal_Number'),
11758                         );
11759     $Alnum->add_alias('XPosixAlnum');
11760     $perl->add_match_table("PosixAlnum",
11761                             Description => "[A-Za-z0-9]",
11762                             Initialize => $Alnum & $ASCII,
11763                             );
11764
11765     my $Word = $perl->add_match_table('Word',
11766                                 Description => '\w, including beyond ASCII;'
11767                                             . ' = \p{Alnum} + \pM + \p{Pc}',
11768                                 Initialize => $Alnum + $gc->table('Mark'),
11769                                 );
11770     $Word->add_alias('XPosixWord');
11771     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11772     $Word += $Pc if defined $Pc;
11773
11774     # This is a Perl extension, so the name doesn't begin with Posix.
11775     my $PerlWord = $perl->add_match_table('PerlWord',
11776                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11777                     Initialize => $Word & $ASCII,
11778                     );
11779     $PerlWord->add_alias('PosixWord');
11780
11781     my $Blank = $perl->add_match_table('Blank',
11782                                 Description => '\h, Horizontal white space',
11783
11784                                 # 200B is Zero Width Space which is for line
11785                                 # break control, and was listed as
11786                                 # Space_Separator in early releases
11787                                 Initialize => $gc->table('Space_Separator')
11788                                             +   0x0009  # TAB
11789                                             -   0x200B, # ZWSP
11790                                 );
11791     $Blank->add_alias('HorizSpace');        # Another name for it.
11792     $Blank->add_alias('XPosixBlank');
11793     $perl->add_match_table("PosixBlank",
11794                             Description => "\\t and ' '",
11795                             Initialize => $Blank & $ASCII,
11796                             );
11797
11798     my $VertSpace = $perl->add_match_table('VertSpace',
11799                             Description => '\v',
11800                             Initialize => $gc->table('Line_Separator')
11801                                         + $gc->table('Paragraph_Separator')
11802                                         + 0x000A  # LINE FEED
11803                                         + 0x000B  # VERTICAL TAB
11804                                         + 0x000C  # FORM FEED
11805                                         + 0x000D  # CARRIAGE RETURN
11806                                         + 0x0085, # NEL
11807                             );
11808     # No Posix equivalent for vertical space
11809
11810     my $Space = $perl->add_match_table('Space',
11811                 Description => '\s including beyond ASCII plus vertical tab',
11812                 Initialize => $Blank + $VertSpace,
11813     );
11814     $Space->add_alias('XPosixSpace');
11815     $perl->add_match_table("PosixSpace",
11816                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
11817                             Initialize => $Space & $ASCII,
11818                             );
11819
11820     # Perl's traditional space doesn't include Vertical Tab
11821     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
11822                                   Description => '\s, including beyond ASCII',
11823                                   Initialize => $Space - 0x000B,
11824                                 );
11825     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
11826     my $PerlSpace = $perl->add_match_table('PerlSpace',
11827                         Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11828                         Initialize => $XPerlSpace & $ASCII,
11829                             );
11830
11831
11832     my $Cntrl = $perl->add_match_table('Cntrl',
11833                                         Description => 'Control characters');
11834     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
11835     $Cntrl->add_alias('XPosixCntrl');
11836     $perl->add_match_table("PosixCntrl",
11837                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
11838                             Initialize => $Cntrl & $ASCII,
11839                             );
11840
11841     # $controls is a temporary used to construct Graph.
11842     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11843                                                 + $gc->table('Control'));
11844     # Cs not in release 1
11845     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11846
11847     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
11848     my $Graph = $perl->add_match_table('Graph',
11849                         Description => 'Characters that are graphical',
11850                         Initialize => ~ ($Space + $controls),
11851                         );
11852     $Graph->add_alias('XPosixGraph');
11853     $perl->add_match_table("PosixGraph",
11854                             Description =>
11855                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
11856                             Initialize => $Graph & $ASCII,
11857                             );
11858
11859     $print = $perl->add_match_table('Print',
11860                         Description => 'Characters that are graphical plus space characters (but no controls)',
11861                         Initialize => $Blank + $Graph - $gc->table('Control'),
11862                         );
11863     $print->add_alias('XPosixPrint');
11864     $perl->add_match_table("PosixPrint",
11865                             Description =>
11866                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
11867                             Initialize => $print & $ASCII,
11868                             );
11869
11870     my $Punct = $perl->add_match_table('Punct');
11871     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11872
11873     # \p{punct} doesn't include the symbols, which posix does
11874     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11875                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
11876                     Initialize => $gc->table('Punctuation')
11877                                 + ($ASCII & $gc->table('Symbol')),
11878         );
11879     $perl->add_match_table('PosixPunct',
11880         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
11881         Initialize => $ASCII & $XPosixPunct,
11882         );
11883
11884     my $Digit = $perl->add_match_table('Digit',
11885                             Description => '[0-9] + all other decimal digits');
11886     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
11887     $Digit->add_alias('XPosixDigit');
11888     my $PosixDigit = $perl->add_match_table("PosixDigit",
11889                                             Description => '[0-9]',
11890                                             Initialize => $Digit & $ASCII,
11891                                             );
11892
11893     # Hex_Digit was not present in first release
11894     my $Xdigit = $perl->add_match_table('XDigit');
11895     $Xdigit->add_alias('XPosixXDigit');
11896     my $Hex = property_ref('Hex_Digit');
11897     if (defined $Hex && ! $Hex->is_empty) {
11898         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
11899     }
11900     else {
11901         # (Have to use hex instead of e.g. '0', because could be running on an
11902         # non-ASCII machine, and we want the Unicode (ASCII) values)
11903         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11904                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11905         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
11906     }
11907
11908     # AHex was not present in early releases
11909     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
11910     my $AHex = property_ref('ASCII_Hex_Digit');
11911     if (defined $AHex && ! $AHex->is_empty) {
11912         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
11913     }
11914     else {
11915         $PosixXDigit->initialize($Xdigit & $ASCII);
11916     }
11917     $PosixXDigit->add_description('[0-9A-Fa-f]');
11918
11919     my $dt = property_ref('Decomposition_Type');
11920     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11921         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11922         Perl_Extension => 1,
11923         Note => 'Union of all non-canonical decompositions',
11924         );
11925
11926     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11927     # than SD appeared, construct it ourselves, based on the first release SD
11928     # was in.
11929     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11930     my $soft_dotted = property_ref('Soft_Dotted');
11931     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11932         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11933     }
11934     else {
11935
11936         # This list came from 3.2 Soft_Dotted.
11937         $CanonDCIJ->initialize([ 0x0069,
11938                                  0x006A,
11939                                  0x012F,
11940                                  0x0268,
11941                                  0x0456,
11942                                  0x0458,
11943                                  0x1E2D,
11944                                  0x1ECB,
11945                                ]);
11946         $CanonDCIJ = $CanonDCIJ & $Assigned;
11947     }
11948
11949     # These are used in Unicode's definition of \X
11950     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11951     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11952
11953     # For backward compatibility, Perl has its own definition for IDStart
11954     # First, we include the underscore, and then the regular XID_Start also
11955     # have to be Words
11956     $perl->add_match_table('_Perl_IDStart',
11957                            Perl_Extension => 1,
11958                            Internal_Only => 1,
11959                            Initialize =>
11960                              ord('_')
11961                              + (property_ref('XID_Start')->table('Y') & $Word)
11962                            );
11963
11964     my $gcb = property_ref('Grapheme_Cluster_Break');
11965
11966     # The 'extended' grapheme cluster came in 5.1.  The non-extended
11967     # definition differs too much from the traditional Perl one to use.
11968     if (defined $gcb && defined $gcb->table('SpacingMark')) {
11969
11970         # Note that assumes HST is defined; it came in an earlier release than
11971         # GCB.  In the line below, two negatives means: yes hangul
11972         $begin += ~ property_ref('Hangul_Syllable_Type')
11973                                                     ->table('Not_Applicable')
11974                + ~ ($gcb->table('Control')
11975                     + $gcb->table('CR')
11976                     + $gcb->table('LF'));
11977         $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11978
11979         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11980         $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
11981     }
11982     else {    # Old definition, used on early releases.
11983         $extend += $gc->table('Mark')
11984                 + 0x200C    # ZWNJ
11985                 + 0x200D;   # ZWJ
11986         $begin += ~ $extend;
11987
11988         # Here we may have a release that has the regular grapheme cluster
11989         # defined, or a release that doesn't have anything defined.
11990         # We set things up so the Perl core degrades gracefully, possibly with
11991         # placeholders that match nothing.
11992
11993         if (! defined $gcb) {
11994             $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11995         }
11996         my $hst = property_ref('HST');
11997         if (!defined $hst) {
11998             $hst = Property->new('HST', Status => $PLACEHOLDER);
11999             $hst->add_match_table('Not_Applicable',
12000                                 Initialize => $Any,
12001                                 Matches_All => 1);
12002         }
12003
12004         # On some releases, here we may not have the needed tables for the
12005         # perl core, in some releases we may.
12006         foreach my $name (qw{ L LV LVT T V prepend }) {
12007             my $table = $gcb->table($name);
12008             if (! defined $table) {
12009                 $table = $gcb->add_match_table($name);
12010                 push @tables_that_may_be_empty, $table->complete_name;
12011             }
12012
12013             # The HST property predates the GCB one, and has identical tables
12014             # for some of them, so use it if we can.
12015             if ($table->is_empty
12016                 && defined $hst
12017                 && defined $hst->table($name))
12018             {
12019                 $table += $hst->table($name);
12020             }
12021         }
12022     }
12023
12024     # More GCB.  If we found some hangul syllables, populate a combined
12025     # table.
12026     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
12027     my $LV = $gcb->table('LV');
12028     if ($LV->is_empty) {
12029         push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12030     } else {
12031         $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12032         $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
12033     }
12034
12035     # Was previously constructed to contain both Name and Unicode_1_Name
12036     my @composition = ('Name', 'Unicode_1_Name');
12037
12038     if (@named_sequences) {
12039         push @composition, 'Named_Sequence';
12040         foreach my $sequence (@named_sequences) {
12041             $perl_charname->add_anomalous_entry($sequence);
12042         }
12043     }
12044
12045     my $alias_sentence = "";
12046     my $alias = property_ref('Name_Alias');
12047     if (defined $alias) {
12048         push @composition, 'Name_Alias';
12049         $alias->reset_each_range;
12050         while (my ($range) = $alias->each_range) {
12051             next if $range->value eq "";
12052             if ($range->start != $range->end) {
12053                 Carp::my_carp("Expecting only one code point in the range $range.  Just to keep going, using just the first code point;");
12054             }
12055             $perl_charname->add_duplicate($range->start, $range->value);
12056         }
12057         $alias_sentence = <<END;
12058 The Name_Alias property adds duplicate code point entries with a corrected
12059 name.  The original (less correct, but still valid) name will be physically
12060 last.
12061 END
12062     }
12063     my $comment;
12064     if (@composition <= 2) { # Always at least 2
12065         $comment = join " and ", @composition;
12066     }
12067     else {
12068         $comment = join ", ", @composition[0 .. scalar @composition - 2];
12069         $comment .= ", and $composition[-1]";
12070     }
12071
12072     $perl_charname->add_comment(join_lines( <<END
12073 This file is for charnames.pm.  It is the union of the $comment properties.
12074 Unicode_1_Name entries are used only for otherwise nameless code
12075 points.
12076 $alias_sentence
12077 END
12078     ));
12079
12080     # Construct the Present_In property from the Age property.
12081     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12082         my $default_map = $age->default_map;
12083         my $in = Property->new('In',
12084                                 Default_Map => $default_map,
12085                                 Full_Name => "Present_In",
12086                                 Internal_Only_Warning => 1,
12087                                 Perl_Extension => 1,
12088                                 Type => $ENUM,
12089                                 Initialize => $age,
12090                                 );
12091         $in->add_comment(join_lines(<<END
12092 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
12093 same as for $age, and not for what $in really means.  This is because anything
12094 defined in a given release should have multiple values: that release and all
12095 higher ones.  But only one value per code point can be represented in a table
12096 like this.
12097 END
12098         ));
12099
12100         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
12101         # lowest numbered (earliest) come first, with the non-numeric one
12102         # last.
12103         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12104                                             ? 1
12105                                             : ($b->name !~ /^[\d.]*$/)
12106                                                 ? -1
12107                                                 : $a->name <=> $b->name
12108                                             } $age->tables;
12109
12110         # The Present_In property is the cumulative age properties.  The first
12111         # one hence is identical to the first age one.
12112         my $previous_in = $in->add_match_table($first_age->name);
12113         $previous_in->set_equivalent_to($first_age, Related => 1);
12114
12115         my $description_start = "Code point's usage introduced in version ";
12116         $first_age->add_description($description_start . $first_age->name);
12117
12118         # To construct the accumulated values, for each of the age tables
12119         # starting with the 2nd earliest, merge the earliest with it, to get
12120         # all those code points existing in the 2nd earliest.  Repeat merging
12121         # the new 2nd earliest with the 3rd earliest to get all those existing
12122         # in the 3rd earliest, and so on.
12123         foreach my $current_age (@rest_ages) {
12124             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
12125
12126             my $current_in = $in->add_match_table(
12127                                     $current_age->name,
12128                                     Initialize => $current_age + $previous_in,
12129                                     Description => $description_start
12130                                                     . $current_age->name
12131                                                     . ' or earlier',
12132                                     );
12133             $previous_in = $current_in;
12134
12135             # Add clarifying material for the corresponding age file.  This is
12136             # in part because of the confusing and contradictory information
12137             # given in the Standard's documentation itself, as of 5.2.
12138             $current_age->add_description(
12139                             "Code point's usage was introduced in version "
12140                             . $current_age->name);
12141             $current_age->add_note("See also $in");
12142
12143         }
12144
12145         # And finally the code points whose usages have yet to be decided are
12146         # the same in both properties.  Note that permanently unassigned code
12147         # points actually have their usage assigned (as being permanently
12148         # unassigned), so that these tables are not the same as gc=cn.
12149         my $unassigned = $in->add_match_table($default_map);
12150         my $age_default = $age->table($default_map);
12151         $age_default->add_description(<<END
12152 Code point's usage has not been assigned in any Unicode release thus far.
12153 END
12154         );
12155         $unassigned->set_equivalent_to($age_default, Related => 1);
12156     }
12157
12158
12159     # Finished creating all the perl properties.  All non-internal non-string
12160     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
12161     # an underscore.)  These do not get a separate entry in the pod file
12162     foreach my $table ($perl->tables) {
12163         foreach my $alias ($table->aliases) {
12164             next if $alias->name =~ /^_/;
12165             $table->add_alias('Is_' . $alias->name,
12166                                Pod_Entry => 0,
12167                                Status => $alias->status,
12168                                Externally_Ok => 0);
12169         }
12170     }
12171
12172     # Here done with all the basic stuff.  Ready to populate the information
12173     # about each character if annotating them.
12174     if ($annotate) {
12175
12176         # See comments at its declaration
12177         $annotate_ranges = Range_Map->new;
12178
12179         # This separates out the non-characters from the other unassigneds, so
12180         # can give different annotations for each.
12181         $unassigned_sans_noncharacters = Range_List->new(
12182          Initialize => $gc->table('Unassigned')
12183                        & property_ref('Noncharacter_Code_Point')->table('N'));
12184
12185         for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
12186             $i = populate_char_info($i);    # Note sets $i so may cause skips
12187         }
12188     }
12189
12190     return;
12191 }
12192
12193 sub add_perl_synonyms() {
12194     # A number of Unicode tables have Perl synonyms that are expressed in
12195     # the single-form, \p{name}.  These are:
12196     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12197     #       \p{Is_Name} as synonyms
12198     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12199     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12200     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12201     #       conflict, \p{Value} and \p{Is_Value} as well
12202     #
12203     # This routine generates these synonyms, warning of any unexpected
12204     # conflicts.
12205
12206     # Construct the list of tables to get synonyms for.  Start with all the
12207     # binary and the General_Category ones.
12208     my @tables = grep { $_->type == $BINARY } property_ref('*');
12209     push @tables, $gc->tables;
12210
12211     # If the version of Unicode includes the Script property, add its tables
12212     if (defined property_ref('Script')) {
12213         push @tables, property_ref('Script')->tables;
12214     }
12215
12216     # The Block tables are kept separate because they are treated differently.
12217     # And the earliest versions of Unicode didn't include them, so add only if
12218     # there are some.
12219     my @blocks;
12220     push @blocks, $block->tables if defined $block;
12221
12222     # Here, have the lists of tables constructed.  Process blocks last so that
12223     # if there are name collisions with them, blocks have lowest priority.
12224     # Should there ever be other collisions, manual intervention would be
12225     # required.  See the comments at the beginning of the program for a
12226     # possible way to handle those semi-automatically.
12227     foreach my $table (@tables,  @blocks) {
12228
12229         # For non-binary properties, the synonym is just the name of the
12230         # table, like Greek, but for binary properties the synonym is the name
12231         # of the property, and means the code points in its 'Y' table.
12232         my $nominal = $table;
12233         my $nominal_property = $nominal->property;
12234         my $actual;
12235         if (! $nominal->isa('Property')) {
12236             $actual = $table;
12237         }
12238         else {
12239
12240             # Here is a binary property.  Use the 'Y' table.  Verify that is
12241             # there
12242             my $yes = $nominal->table('Y');
12243             unless (defined $yes) {  # Must be defined, but is permissible to
12244                                      # be empty.
12245                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
12246                 next;
12247             }
12248             $actual = $yes;
12249         }
12250
12251         foreach my $alias ($nominal->aliases) {
12252
12253             # Attempt to create a table in the perl directory for the
12254             # candidate table, using whatever aliases in it that don't
12255             # conflict.  Also add non-conflicting aliases for all these
12256             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12257             PREFIX:
12258             foreach my $prefix ("", 'Is_', 'In_') {
12259
12260                 # Only Block properties can have added 'In_' aliases.
12261                 next if $prefix eq 'In_' and $nominal_property != $block;
12262
12263                 my $proposed_name = $prefix . $alias->name;
12264
12265                 # No Is_Is, In_In, nor combinations thereof
12266                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12267                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12268
12269                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12270
12271                 # Get a reference to any existing table in the perl
12272                 # directory with the desired name.
12273                 my $pre_existing = $perl->table($proposed_name);
12274
12275                 if (! defined $pre_existing) {
12276
12277                     # No name collision, so ok to add the perl synonym.
12278
12279                     my $make_pod_entry;
12280                     my $externally_ok;
12281                     my $status = $alias->status;
12282                     if ($nominal_property == $block) {
12283
12284                         # For block properties, the 'In' form is preferred for
12285                         # external use; the pod file contains wild cards for
12286                         # this and the 'Is' form so no entries for those; and
12287                         # we don't want people using the name without the
12288                         # 'In', so discourage that.
12289                         if ($prefix eq "") {
12290                             $make_pod_entry = 1;
12291                             $status = $status || $DISCOURAGED;
12292                             $externally_ok = 0;
12293                         }
12294                         elsif ($prefix eq 'In_') {
12295                             $make_pod_entry = 0;
12296                             $status = $status || $NORMAL;
12297                             $externally_ok = 1;
12298                         }
12299                         else {
12300                             $make_pod_entry = 0;
12301                             $status = $status || $DISCOURAGED;
12302                             $externally_ok = 0;
12303                         }
12304                     }
12305                     elsif ($prefix ne "") {
12306
12307                         # The 'Is' prefix is handled in the pod by a wild
12308                         # card, and we won't use it for an external name
12309                         $make_pod_entry = 0;
12310                         $status = $status || $NORMAL;
12311                         $externally_ok = 0;
12312                     }
12313                     else {
12314
12315                         # Here, is an empty prefix, non block.  This gets its
12316                         # own pod entry and can be used for an external name.
12317                         $make_pod_entry = 1;
12318                         $status = $status || $NORMAL;
12319                         $externally_ok = 1;
12320                     }
12321
12322                     # Here, there isn't a perl pre-existing table with the
12323                     # name.  Look through the list of equivalents of this
12324                     # table to see if one is a perl table.
12325                     foreach my $equivalent ($actual->leader->equivalents) {
12326                         next if $equivalent->property != $perl;
12327
12328                         # Here, have found a table for $perl.  Add this alias
12329                         # to it, and are done with this prefix.
12330                         $equivalent->add_alias($proposed_name,
12331                                         Pod_Entry => $make_pod_entry,
12332                                         Status => $status,
12333                                         Externally_Ok => $externally_ok);
12334                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12335                         next PREFIX;
12336                     }
12337
12338                     # Here, $perl doesn't already have a table that is a
12339                     # synonym for this property, add one.
12340                     my $added_table = $perl->add_match_table($proposed_name,
12341                                             Pod_Entry => $make_pod_entry,
12342                                             Status => $status,
12343                                             Externally_Ok => $externally_ok);
12344                     # And it will be related to the actual table, since it is
12345                     # based on it.
12346                     $added_table->set_equivalent_to($actual, Related => 1);
12347                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12348                     next;
12349                 } # End of no pre-existing.
12350
12351                 # Here, there is a pre-existing table that has the proposed
12352                 # name.  We could be in trouble, but not if this is just a
12353                 # synonym for another table that we have already made a child
12354                 # of the pre-existing one.
12355                 if ($pre_existing->is_set_equivalent_to($actual)) {
12356                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12357                     $pre_existing->add_alias($proposed_name);
12358                     next;
12359                 }
12360
12361                 # Here, there is a name collision, but it still could be ok if
12362                 # the tables match the identical set of code points, in which
12363                 # case, we can combine the names.  Compare each table's code
12364                 # point list to see if they are identical.
12365                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12366                 if ($pre_existing->matches_identically_to($actual)) {
12367
12368                     # Here, they do match identically.  Not a real conflict.
12369                     # Make the perl version a child of the Unicode one, except
12370                     # in the non-obvious case of where the perl name is
12371                     # already a synonym of another Unicode property.  (This is
12372                     # excluded by the test for it being its own parent.)  The
12373                     # reason for this exclusion is that then the two Unicode
12374                     # properties become related; and we don't really know if
12375                     # they are or not.  We generate documentation based on
12376                     # relatedness, and this would be misleading.  Code
12377                     # later executed in the process will cause the tables to
12378                     # be represented by a single file anyway, without making
12379                     # it look in the pod like they are necessarily related.
12380                     if ($pre_existing->parent == $pre_existing
12381                         && ($pre_existing->property == $perl
12382                             || $actual->property == $perl))
12383                     {
12384                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12385                         $pre_existing->set_equivalent_to($actual, Related => 1);
12386                     }
12387                     elsif (main::DEBUG && $to_trace) {
12388                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12389                         trace $pre_existing->parent;
12390                     }
12391                     next PREFIX;
12392                 }
12393
12394                 # Here they didn't match identically, there is a real conflict
12395                 # between our new name and a pre-existing property.
12396                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12397                 $pre_existing->add_conflicting($nominal->full_name,
12398                                                'p',
12399                                                $actual);
12400
12401                 # Don't output a warning for aliases for the block
12402                 # properties (unless they start with 'In_') as it is
12403                 # expected that there will be conflicts and the block
12404                 # form loses.
12405                 if ($verbosity >= $NORMAL_VERBOSITY
12406                     && ($actual->property != $block || $prefix eq 'In_'))
12407                 {
12408                     print simple_fold(join_lines(<<END
12409 There is already an alias named $proposed_name (from " . $pre_existing . "),
12410 so not creating this alias for " . $actual
12411 END
12412                     ), "", 4);
12413                 }
12414
12415                 # Keep track for documentation purposes.
12416                 $has_In_conflicts++ if $prefix eq 'In_';
12417                 $has_Is_conflicts++ if $prefix eq 'Is_';
12418             }
12419         }
12420     }
12421
12422     # There are some properties which have No and Yes (and N and Y) as
12423     # property values, but aren't binary, and could possibly be confused with
12424     # binary ones.  So create caveats for them.  There are tables that are
12425     # named 'No', and tables that are named 'N', but confusion is not likely
12426     # unless they are the same table.  For example, N meaning Number or
12427     # Neutral is not likely to cause confusion, so don't add caveats to things
12428     # like them.
12429     foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
12430         my $yes = $property->table('Yes');
12431         if (defined $yes) {
12432             my $y = $property->table('Y');
12433             if (defined $y && $yes == $y) {
12434                 foreach my $alias ($property->aliases) {
12435                     $yes->add_conflicting($alias->name);
12436                 }
12437             }
12438         }
12439         my $no = $property->table('No');
12440         if (defined $no) {
12441             my $n = $property->table('N');
12442             if (defined $n && $no == $n) {
12443                 foreach my $alias ($property->aliases) {
12444                     $no->add_conflicting($alias->name, 'P');
12445                 }
12446             }
12447         }
12448     }
12449
12450     return;
12451 }
12452
12453 sub register_file_for_name($$$) {
12454     # Given info about a table and a datafile that it should be associated
12455     # with, register that association
12456
12457     my $table = shift;
12458     my $directory_ref = shift;   # Array of the directory path for the file
12459     my $file = shift;            # The file name in the final directory.
12460     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12461
12462     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12463
12464     if ($table->isa('Property')) {
12465         $table->set_file_path(@$directory_ref, $file);
12466         push @map_properties, $table
12467                                     if $directory_ref->[0] eq $map_directory;
12468         return;
12469     }
12470
12471     # Do all of the work for all equivalent tables when called with the leader
12472     # table, so skip if isn't the leader.
12473     return if $table->leader != $table;
12474
12475     # If this is a complement of another file, use that other file instead,
12476     # with a ! prepended to it.
12477     my $complement;
12478     if (($complement = $table->complement) != 0) {
12479         my @directories = $complement->file_path;
12480
12481         # This assumes that the 0th element is something like 'lib',
12482         # the 1th element the property name (in its own directory), like
12483         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12484         # appended to it later.
12485         $directories[1] =~ s/^/!/;
12486         $file = pop @directories;
12487         $directory_ref =\@directories;
12488     }
12489
12490     # Join all the file path components together, using slashes.
12491     my $full_filename = join('/', @$directory_ref, $file);
12492
12493     # All go in the same subdirectory of unicore
12494     if ($directory_ref->[0] ne $matches_directory) {
12495         Carp::my_carp("Unexpected directory in "
12496                 .  join('/', @{$directory_ref}, $file));
12497     }
12498
12499     # For this table and all its equivalents ...
12500     foreach my $table ($table, $table->equivalents) {
12501
12502         # Associate it with its file internally.  Don't include the
12503         # $matches_directory first component
12504         $table->set_file_path(@$directory_ref, $file);
12505         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12506
12507         my $property = $table->property;
12508         $property = ($property == $perl)
12509                     ? ""                # 'perl' is never explicitly stated
12510                     : standardize($property->name) . '=';
12511
12512         my $deprecated = ($table->status eq $DEPRECATED)
12513                          ? $table->status_info
12514                          : "";
12515         my $caseless_equivalent = $table->caseless_equivalent;
12516
12517         # And for each of the table's aliases...  This inner loop eventually
12518         # goes through all aliases in the UCD that we generate regex match
12519         # files for
12520         foreach my $alias ($table->aliases) {
12521             my $standard = utf8_heavy_name($table, $alias);
12522
12523             # Generate an entry in either the loose or strict hashes, which
12524             # will translate the property and alias names combination into the
12525             # file where the table for them is stored.
12526             if ($alias->loose_match) {
12527                 if (exists $loose_to_file_of{$standard}) {
12528                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12529                 }
12530                 else {
12531                     $loose_to_file_of{$standard} = $sub_filename;
12532                 }
12533             }
12534             else {
12535                 if (exists $stricter_to_file_of{$standard}) {
12536                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12537                 }
12538                 else {
12539                     $stricter_to_file_of{$standard} = $sub_filename;
12540
12541                     # Tightly coupled with how utf8_heavy.pl works, for a
12542                     # floating point number that is a whole number, get rid of
12543                     # the trailing decimal point and 0's, so that utf8_heavy
12544                     # will work.  Also note that this assumes that such a
12545                     # number is matched strictly; so if that were to change,
12546                     # this would be wrong.
12547                     if ((my $integer_name = $alias->name)
12548                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12549                     {
12550                         $stricter_to_file_of{$property . $integer_name}
12551                                                             = $sub_filename;
12552                     }
12553                 }
12554             }
12555
12556             # Keep a list of the deprecated properties and their filenames
12557             if ($deprecated && $complement == 0) {
12558                 $utf8::why_deprecated{$sub_filename} = $deprecated;
12559             }
12560
12561             # And a substitute table, if any, for case-insensitive matching
12562             if ($caseless_equivalent != 0) {
12563                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12564             }
12565         }
12566     }
12567
12568     return;
12569 }
12570
12571 {   # Closure
12572     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
12573                      # conflicts
12574     my %full_dir_name_of;   # Full length names of directories used.
12575
12576     sub construct_filename($$$) {
12577         # Return a file name for a table, based on the table name, but perhaps
12578         # changed to get rid of non-portable characters in it, and to make
12579         # sure that it is unique on a file system that allows the names before
12580         # any period to be at most 8 characters (DOS).  While we're at it
12581         # check and complain if there are any directory conflicts.
12582
12583         my $name = shift;       # The name to start with
12584         my $mutable = shift;    # Boolean: can it be changed?  If no, but
12585                                 # yet it must be to work properly, a warning
12586                                 # is given
12587         my $directories_ref = shift;  # A reference to an array containing the
12588                                 # path to the file, with each element one path
12589                                 # component.  This is used because the same
12590                                 # name can be used in different directories.
12591         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12592
12593         my $warn = ! defined wantarray;  # If true, then if the name is
12594                                 # changed, a warning is issued as well.
12595
12596         if (! defined $name) {
12597             Carp::my_carp("Undefined name in directory "
12598                           . File::Spec->join(@$directories_ref)
12599                           . ". '_' used");
12600             return '_';
12601         }
12602
12603         # Make sure that no directory names conflict with each other.  Look at
12604         # each directory in the input file's path.  If it is already in use,
12605         # assume it is correct, and is merely being re-used, but if we
12606         # truncate it to 8 characters, and find that there are two directories
12607         # that are the same for the first 8 characters, but differ after that,
12608         # then that is a problem.
12609         foreach my $directory (@$directories_ref) {
12610             my $short_dir = substr($directory, 0, 8);
12611             if (defined $full_dir_name_of{$short_dir}) {
12612                 next if $full_dir_name_of{$short_dir} eq $directory;
12613                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
12614             }
12615             else {
12616                 $full_dir_name_of{$short_dir} = $directory;
12617             }
12618         }
12619
12620         my $path = join '/', @$directories_ref;
12621         $path .= '/' if $path;
12622
12623         # Remove interior underscores.
12624         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12625
12626         # Change any non-word character into an underscore, and truncate to 8.
12627         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
12628         substr($filename, 8) = "" if length($filename) > 8;
12629
12630         # Make sure the basename doesn't conflict with something we
12631         # might have already written. If we have, say,
12632         #     InGreekExtended1
12633         #     InGreekExtended2
12634         # they become
12635         #     InGreekE
12636         #     InGreek2
12637         my $warned = 0;
12638         while (my $num = $base_names{$path}{lc $filename}++) {
12639             $num++; # so basenames with numbers start with '2', which
12640                     # just looks more natural.
12641
12642             # Want to append $num, but if it'll make the basename longer
12643             # than 8 characters, pre-truncate $filename so that the result
12644             # is acceptable.
12645             my $delta = length($filename) + length($num) - 8;
12646             if ($delta > 0) {
12647                 substr($filename, -$delta) = $num;
12648             }
12649             else {
12650                 $filename .= $num;
12651             }
12652             if ($warn && ! $warned) {
12653                 $warned = 1;
12654                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
12655             }
12656         }
12657
12658         return $filename if $mutable;
12659
12660         # If not changeable, must return the input name, but warn if needed to
12661         # change it beyond shortening it.
12662         if ($name ne $filename
12663             && substr($name, 0, length($filename)) ne $filename) {
12664             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
12665         }
12666         return $name;
12667     }
12668 }
12669
12670 # The pod file contains a very large table.  Many of the lines in that table
12671 # would exceed a typical output window's size, and so need to be wrapped with
12672 # a hanging indent to make them look good.  The pod language is really
12673 # insufficient here.  There is no general construct to do that in pod, so it
12674 # is done here by beginning each such line with a space to cause the result to
12675 # be output without formatting, and doing all the formatting here.  This leads
12676 # to the result that if the eventual display window is too narrow it won't
12677 # look good, and if the window is too wide, no advantage is taken of that
12678 # extra width.  A further complication is that the output may be indented by
12679 # the formatter so that there is less space than expected.  What I (khw) have
12680 # done is to assume that that indent is a particular number of spaces based on
12681 # what it is in my Linux system;  people can always resize their windows if
12682 # necessary, but this is obviously less than desirable, but the best that can
12683 # be expected.
12684 my $automatic_pod_indent = 8;
12685
12686 # Try to format so that uses fewest lines, but few long left column entries
12687 # slide into the right column.  An experiment on 5.1 data yielded the
12688 # following percentages that didn't cut into the other side along with the
12689 # associated first-column widths
12690 # 69% = 24
12691 # 80% not too bad except for a few blocks
12692 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12693 # 95% = 37;
12694 my $indent_info_column = 27;    # 75% of lines didn't have overlap
12695
12696 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
12697                     # The 3 is because of:
12698                     #   1   for the leading space to tell the pod formatter to
12699                     #       output as-is
12700                     #   1   for the flag
12701                     #   1   for the space between the flag and the main data
12702
12703 sub format_pod_line ($$$;$$) {
12704     # Take a pod line and return it, formatted properly
12705
12706     my $first_column_width = shift;
12707     my $entry = shift;  # Contents of left column
12708     my $info = shift;   # Contents of right column
12709
12710     my $status = shift || "";   # Any flag
12711
12712     my $loose_match = shift;    # Boolean.
12713     $loose_match = 1 unless defined $loose_match;
12714
12715     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12716
12717     my $flags = "";
12718     $flags .= $STRICTER if ! $loose_match;
12719
12720     $flags .= $status if $status;
12721
12722     # There is a blank in the left column to cause the pod formatter to
12723     # output the line as-is.
12724     return sprintf " %-*s%-*s %s\n",
12725                     # The first * in the format is replaced by this, the -1 is
12726                     # to account for the leading blank.  There isn't a
12727                     # hard-coded blank after this to separate the flags from
12728                     # the rest of the line, so that in the unlikely event that
12729                     # multiple flags are shown on the same line, they both
12730                     # will get displayed at the expense of that separation,
12731                     # but since they are left justified, a blank will be
12732                     # inserted in the normal case.
12733                     $FILLER - 1,
12734                     $flags,
12735
12736                     # The other * in the format is replaced by this number to
12737                     # cause the first main column to right fill with blanks.
12738                     # The -1 is for the guaranteed blank following it.
12739                     $first_column_width - $FILLER - 1,
12740                     $entry,
12741                     $info;
12742 }
12743
12744 my @zero_match_tables;  # List of tables that have no matches in this release
12745
12746 sub make_table_pod_entries($) {
12747     # This generates the entries for the pod file for a given table.
12748     # Also done at this time are any children tables.  The output looks like:
12749     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
12750
12751     my $input_table = shift;        # Table the entry is for
12752     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12753
12754     # Generate parent and all its children at the same time.
12755     return if $input_table->parent != $input_table;
12756
12757     my $property = $input_table->property;
12758     my $type = $property->type;
12759     my $full_name = $property->full_name;
12760
12761     my $count = $input_table->count;
12762     my $string_count = clarify_number($count);
12763     my $status = $input_table->status;
12764     my $status_info = $input_table->status_info;
12765     my $caseless_equivalent = $input_table->caseless_equivalent;
12766
12767     my $entry_for_first_table; # The entry for the first table output.
12768                            # Almost certainly, it is the parent.
12769
12770     # For each related table (including itself), we will generate a pod entry
12771     # for each name each table goes by
12772     foreach my $table ($input_table, $input_table->children) {
12773
12774         # utf8_heavy.pl cannot deal with null string property values, so don't
12775         # output any.
12776         next if $table->name eq "";
12777
12778         # First, gather all the info that applies to this table as a whole.
12779
12780         push @zero_match_tables, $table if $count == 0;
12781
12782         my $table_property = $table->property;
12783
12784         # The short name has all the underscores removed, while the full name
12785         # retains them.  Later, we decide whether to output a short synonym
12786         # for the full one, we need to compare apples to apples, so we use the
12787         # short name's length including underscores.
12788         my $table_property_short_name_length;
12789         my $table_property_short_name
12790             = $table_property->short_name(\$table_property_short_name_length);
12791         my $table_property_full_name = $table_property->full_name;
12792
12793         # Get how much savings there is in the short name over the full one
12794         # (delta will always be <= 0)
12795         my $table_property_short_delta = $table_property_short_name_length
12796                                          - length($table_property_full_name);
12797         my @table_description = $table->description;
12798         my @table_note = $table->note;
12799
12800         # Generate an entry for each alias in this table.
12801         my $entry_for_first_alias;  # saves the first one encountered.
12802         foreach my $alias ($table->aliases) {
12803
12804             # Skip if not to go in pod.
12805             next unless $alias->make_pod_entry;
12806
12807             # Start gathering all the components for the entry
12808             my $name = $alias->name;
12809
12810             my $entry;      # Holds the left column, may include extras
12811             my $entry_ref;  # To refer to the left column's contents from
12812                             # another entry; has no extras
12813
12814             # First the left column of the pod entry.  Tables for the $perl
12815             # property always use the single form.
12816             if ($table_property == $perl) {
12817                 $entry = "\\p{$name}";
12818                 $entry_ref = "\\p{$name}";
12819             }
12820             else {    # Compound form.
12821
12822                 # Only generate one entry for all the aliases that mean true
12823                 # or false in binary properties.  Append a '*' to indicate
12824                 # some are missing.  (The heading comment notes this.)
12825                 my $wild_card_mark;
12826                 if ($type == $BINARY) {
12827                     next if $name ne 'N' && $name ne 'Y';
12828                     $wild_card_mark = '*';
12829                 }
12830                 else {
12831                     $wild_card_mark = "";
12832                 }
12833
12834                 # Colon-space is used to give a little more space to be easier
12835                 # to read;
12836                 $entry = "\\p{"
12837                         . $table_property_full_name
12838                         . ": $name$wild_card_mark}";
12839
12840                 # But for the reference to this entry, which will go in the
12841                 # right column, where space is at a premium, use equals
12842                 # without a space
12843                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12844             }
12845
12846             # Then the right (info) column.  This is stored as components of
12847             # an array for the moment, then joined into a string later.  For
12848             # non-internal only properties, begin the info with the entry for
12849             # the first table we encountered (if any), as things are ordered
12850             # so that that one is the most descriptive.  This leads to the
12851             # info column of an entry being a more descriptive version of the
12852             # name column
12853             my @info;
12854             if ($name =~ /^_/) {
12855                 push @info,
12856                         '(For internal use by Perl, not necessarily stable)';
12857             }
12858             elsif ($entry_for_first_alias) {
12859                 push @info, $entry_for_first_alias;
12860             }
12861
12862             # If this entry is equivalent to another, add that to the info,
12863             # using the first such table we encountered
12864             if ($entry_for_first_table) {
12865                 if (@info) {
12866                     push @info, "(= $entry_for_first_table)";
12867                 }
12868                 else {
12869                     push @info, $entry_for_first_table;
12870                 }
12871             }
12872
12873             # If the name is a large integer, add an equivalent with an
12874             # exponent for better readability
12875             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12876                 push @info, sprintf "(= %.1e)", $name
12877             }
12878
12879             my $parenthesized = "";
12880             if (! $entry_for_first_alias) {
12881
12882                 # This is the first alias for the current table.  The alias
12883                 # array is ordered so that this is the fullest, most
12884                 # descriptive alias, so it gets the fullest info.  The other
12885                 # aliases are mostly merely pointers to this one, using the
12886                 # information already added above.
12887
12888                 # Display any status message, but only on the parent table
12889                 if ($status && ! $entry_for_first_table) {
12890                     push @info, $status_info;
12891                 }
12892
12893                 # Put out any descriptive info
12894                 if (@table_description || @table_note) {
12895                     push @info, join "; ", @table_description, @table_note;
12896                 }
12897
12898                 # Look to see if there is a shorter name we can point people
12899                 # at
12900                 my $standard_name = standardize($name);
12901                 my $short_name;
12902                 my $proposed_short = $table->short_name;
12903                 if (defined $proposed_short) {
12904                     my $standard_short = standardize($proposed_short);
12905
12906                     # If the short name is shorter than the standard one, or
12907                     # even it it's not, but the combination of it and its
12908                     # short property name (as in \p{prop=short} ($perl doesn't
12909                     # have this form)) saves at least two characters, then,
12910                     # cause it to be listed as a shorter synonym.
12911                     if (length $standard_short < length $standard_name
12912                         || ($table_property != $perl
12913                             && (length($standard_short)
12914                                 - length($standard_name)
12915                                 + $table_property_short_delta)  # (<= 0)
12916                                 < -2))
12917                     {
12918                         $short_name = $proposed_short;
12919                         if ($table_property != $perl) {
12920                             $short_name = $table_property_short_name
12921                                           . "=$short_name";
12922                         }
12923                         $short_name = "\\p{$short_name}";
12924                     }
12925                 }
12926
12927                 # And if this is a compound form name, see if there is a
12928                 # single form equivalent
12929                 my $single_form;
12930                 if ($table_property != $perl) {
12931
12932                     # Special case the binary N tables, so that will print
12933                     # \P{single}, but use the Y table values to populate
12934                     # 'single', as we haven't likewise populated the N table.
12935                     my $test_table;
12936                     my $p;
12937                     if ($type == $BINARY
12938                         && $input_table == $property->table('No'))
12939                     {
12940                         $test_table = $property->table('Yes');
12941                         $p = 'P';
12942                     }
12943                     else {
12944                         $test_table = $input_table;
12945                         $p = 'p';
12946                     }
12947
12948                     # Look for a single form amongst all the children.
12949                     foreach my $table ($test_table->children) {
12950                         next if $table->property != $perl;
12951                         my $proposed_name = $table->short_name;
12952                         next if ! defined $proposed_name;
12953
12954                         # Don't mention internal-only properties as a possible
12955                         # single form synonym
12956                         next if substr($proposed_name, 0, 1) eq '_';
12957
12958                         $proposed_name = "\\$p\{$proposed_name}";
12959                         if (! defined $single_form
12960                             || length($proposed_name) < length $single_form)
12961                         {
12962                             $single_form = $proposed_name;
12963
12964                             # The goal here is to find a single form; not the
12965                             # shortest possible one.  We've already found a
12966                             # short name.  So, stop at the first single form
12967                             # found, which is likely to be closer to the
12968                             # original.
12969                             last;
12970                         }
12971                     }
12972                 }
12973
12974                 # Ouput both short and single in the same parenthesized
12975                 # expression, but with only one of 'Single', 'Short' if there
12976                 # are both items.
12977                 if ($short_name || $single_form || $table->conflicting) {
12978                     $parenthesized .= "Short: $short_name" if $short_name;
12979                     if ($short_name && $single_form) {
12980                         $parenthesized .= ', ';
12981                     }
12982                     elsif ($single_form) {
12983                         $parenthesized .= 'Single: ';
12984                     }
12985                     $parenthesized .= $single_form if $single_form;
12986                 }
12987             }
12988
12989             if ($caseless_equivalent != 0) {
12990                 $parenthesized .=  '; ' if $parenthesized ne "";
12991                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
12992             }
12993
12994
12995             # Warn if this property isn't the same as one that a
12996             # semi-casual user might expect.  The other components of this
12997             # parenthesized structure are calculated only for the first entry
12998             # for this table, but the conflicting is deemed important enough
12999             # to go on every entry.
13000             my $conflicting = join " NOR ", $table->conflicting;
13001             if ($conflicting) {
13002                 $parenthesized .=  '; ' if $parenthesized ne "";
13003                 $parenthesized .= "NOT $conflicting";
13004             }
13005
13006             push @info, "($parenthesized)" if $parenthesized;
13007
13008             if ($name =~ /_$/ && $alias->loose_match) {
13009                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13010             }
13011
13012             if ($table_property != $perl && $table->perl_extension) {
13013                 push @info, '(Perl extension)';
13014             }
13015             push @info, "($string_count)";
13016
13017             # Now, we have both the entry and info so add them to the
13018             # list of all the properties.
13019             push @match_properties,
13020                 format_pod_line($indent_info_column,
13021                                 $entry,
13022                                 join( " ", @info),
13023                                 $alias->status,
13024                                 $alias->loose_match);
13025
13026             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13027         } # End of looping through the aliases for this table.
13028
13029         if (! $entry_for_first_table) {
13030             $entry_for_first_table = $entry_for_first_alias;
13031         }
13032     } # End of looping through all the related tables
13033     return;
13034 }
13035
13036 sub pod_alphanumeric_sort {
13037     # Sort pod entries alphanumerically.
13038
13039     # The first few character columns are filler, plus the '\p{'; and get rid
13040     # of all the trailing stuff, starting with the trailing '}', so as to sort
13041     # on just 'Name=Value'
13042     (my $a = lc $a) =~ s/^ .*? { //x;
13043     $a =~ s/}.*//;
13044     (my $b = lc $b) =~ s/^ .*? { //x;
13045     $b =~ s/}.*//;
13046
13047     # Determine if the two operands are both internal only or both not.
13048     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13049     # should be the underscore that begins internal only
13050     my $a_is_internal = (substr($a, 0, 1) eq '_');
13051     my $b_is_internal = (substr($b, 0, 1) eq '_');
13052
13053     # Sort so the internals come last in the table instead of first (which the
13054     # leading underscore would otherwise indicate).
13055     if ($a_is_internal != $b_is_internal) {
13056         return 1 if $a_is_internal;
13057         return -1
13058     }
13059
13060     # Determine if the two operands are numeric property values or not.
13061     # A numeric property will look like xyz: 3.  But the number
13062     # can begin with an optional minus sign, and may have a
13063     # fraction or rational component, like xyz: 3/2.  If either
13064     # isn't numeric, use alphabetic sort.
13065     my ($a_initial, $a_number) =
13066         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13067     return $a cmp $b unless defined $a_number;
13068     my ($b_initial, $b_number) =
13069         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13070     return $a cmp $b unless defined $b_number;
13071
13072     # Here they are both numeric, but use alphabetic sort if the
13073     # initial parts don't match
13074     return $a cmp $b if $a_initial ne $b_initial;
13075
13076     # Convert rationals to floating for the comparison.
13077     $a_number = eval $a_number if $a_number =~ qr{/};
13078     $b_number = eval $b_number if $b_number =~ qr{/};
13079
13080     return $a_number <=> $b_number;
13081 }
13082
13083 sub make_pod () {
13084     # Create the .pod file.  This generates the various subsections and then
13085     # combines them in one big HERE document.
13086
13087     return unless defined $pod_directory;
13088     print "Making pod file\n" if $verbosity >= $PROGRESS;
13089
13090     my $exception_message =
13091     '(Any exceptions are individually noted beginning with the word NOT.)';
13092     my @block_warning;
13093     if (-e 'Blocks.txt') {
13094
13095         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
13096         # if the global $has_In_conflicts indicates we have them.
13097         push @match_properties, format_pod_line($indent_info_column,
13098                                                 '\p{In_*}',
13099                                                 '\p{Block: *}'
13100                                                     . (($has_In_conflicts)
13101                                                       ? " $exception_message"
13102                                                       : ""));
13103         @block_warning = << "END";
13104
13105 Matches in the Block property have shortcuts that begin with "In_".  For
13106 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
13107 backward compatibility, if there is no conflict with another shortcut, these
13108 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
13109 are numerous such conflicting shortcuts.  Use of these forms for Block is
13110 discouraged, and are flagged as such, not only because of the potential
13111 confusion as to what is meant, but also because a later release of Unicode may
13112 preempt the shortcut, and your program would no longer be correct.  Use the
13113 "In_" form instead to avoid this, or even more clearly, use the compound form,
13114 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
13115 about this.
13116 END
13117     }
13118     my $text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13119     $text = "$exception_message $text" if $has_Is_conflicts;
13120
13121     # And the 'Is_ line';
13122     push @match_properties, format_pod_line($indent_info_column,
13123                                             '\p{Is_*}',
13124                                             "\\p{*} $text");
13125
13126     # Sort the properties array for output.  It is sorted alphabetically
13127     # except numerically for numeric properties, and only output unique lines.
13128     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13129
13130     my $formatted_properties = simple_fold(\@match_properties,
13131                                         "",
13132                                         # indent succeeding lines by two extra
13133                                         # which looks better
13134                                         $indent_info_column + 2,
13135
13136                                         # shorten the line length by how much
13137                                         # the formatter indents, so the folded
13138                                         # line will fit in the space
13139                                         # presumably available
13140                                         $automatic_pod_indent);
13141     # Add column headings, indented to be a little more centered, but not
13142     # exactly
13143     $formatted_properties =  format_pod_line($indent_info_column,
13144                                                     '    NAME',
13145                                                     '           INFO')
13146                                     . "\n"
13147                                     . $formatted_properties;
13148
13149     # Generate pod documentation lines for the tables that match nothing
13150     my $zero_matches = "";
13151     if (@zero_match_tables) {
13152         @zero_match_tables = uniques(@zero_match_tables);
13153         $zero_matches = join "\n\n",
13154                         map { $_ = '=item \p{' . $_->complete_name . "}" }
13155                             sort { $a->complete_name cmp $b->complete_name }
13156                             @zero_match_tables;
13157
13158         $zero_matches = <<END;
13159
13160 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
13161
13162 Unicode has some property-value pairs that currently don't match anything.
13163 This happens generally either because they are obsolete, or they exist for
13164 symmetry with other forms, but no language has yet been encoded that uses
13165 them.  In this version of Unicode, the following match zero code points:
13166
13167 =over 4
13168
13169 $zero_matches
13170
13171 =back
13172
13173 END
13174     }
13175
13176     # Generate list of properties that we don't accept, grouped by the reasons
13177     # why.  This is so only put out the 'why' once, and then list all the
13178     # properties that have that reason under it.
13179
13180     my %why_list;   # The keys are the reasons; the values are lists of
13181                     # properties that have the key as their reason
13182
13183     # For each property, add it to the list that are suppressed for its reason
13184     # The sort will cause the alphabetically first properties to be added to
13185     # each list first, so each list will be sorted.
13186     foreach my $property (sort keys %why_suppressed) {
13187         push @{$why_list{$why_suppressed{$property}}}, $property;
13188     }
13189
13190     # For each reason (sorted by the first property that has that reason)...
13191     my @bad_re_properties;
13192     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13193                      keys %why_list)
13194     {
13195         # Add to the output, all the properties that have that reason.  Start
13196         # with an empty line.
13197         push @bad_re_properties, "\n\n";
13198
13199         my $has_item = 0;   # Flag if actually output anything.
13200         foreach my $name (@{$why_list{$why}}) {
13201
13202             # Split compound names into $property and $table components
13203             my $property = $name;
13204             my $table;
13205             if ($property =~ / (.*) = (.*) /x) {
13206                 $property = $1;
13207                 $table = $2;
13208             }
13209
13210             # This release of Unicode may not have a property that is
13211             # suppressed, so don't reference a non-existent one.
13212             $property = property_ref($property);
13213             next if ! defined $property;
13214
13215             # And since this list is only for match tables, don't list the
13216             # ones that don't have match tables.
13217             next if ! $property->to_create_match_tables;
13218
13219             # Find any abbreviation, and turn it into a compound name if this
13220             # is a property=value pair.
13221             my $short_name = $property->name;
13222             $short_name .= '=' . $property->table($table)->name if $table;
13223
13224             # And add the property as an item for the reason.
13225             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13226             $has_item = 1;
13227         }
13228
13229         # And add the reason under the list of properties, if such a list
13230         # actually got generated.  Note that the header got added
13231         # unconditionally before.  But pod ignores extra blank lines, so no
13232         # harm.
13233         push @bad_re_properties, "\n$why\n" if $has_item;
13234
13235     } # End of looping through each reason.
13236
13237     # Generate a list of the properties whose map table we output, from the
13238     # global @map_properties.
13239     my @map_tables_actually_output;
13240     my $info_indent = 20;       # Left column is narrower than \p{} table.
13241     foreach my $property (@map_properties) {
13242
13243         # Get the path to the file; don't output any not in the standard
13244         # directory.
13245         my @path = $property->file_path;
13246         next if $path[0] ne $map_directory;
13247
13248         # Don't mention map tables that are for internal-use only
13249         next if $property->to_output_map == $INTERNAL_MAP;
13250
13251         shift @path;    # Remove the standard name
13252
13253         my $file = join '/', @path; # In case is in sub directory
13254         my $info = $property->full_name;
13255         my $short_name = $property->name;
13256         if ($info ne $short_name) {
13257             $info .= " ($short_name)";
13258         }
13259         foreach my $more_info ($property->description,
13260                                $property->note,
13261                                $property->status_info)
13262         {
13263             next unless $more_info;
13264             $info =~ s/\.\Z//;
13265             $info .= ".  $more_info";
13266         }
13267         push @map_tables_actually_output, format_pod_line($info_indent,
13268                                                           $file,
13269                                                           $info,
13270                                                           $property->status);
13271     }
13272
13273     # Sort alphabetically, and fold for output
13274     @map_tables_actually_output = sort
13275                             pod_alphanumeric_sort @map_tables_actually_output;
13276     @map_tables_actually_output
13277                         = simple_fold(\@map_tables_actually_output,
13278                                         ' ',
13279                                         $info_indent,
13280                                         $automatic_pod_indent);
13281
13282     # Generate a list of the formats that can appear in the map tables.
13283     my @map_table_formats;
13284     foreach my $format (sort keys %map_table_formats) {
13285         push @map_table_formats, "  $format    $map_table_formats{$format}\n";
13286     }
13287
13288     local $" = "";
13289
13290     # Everything is ready to assemble.
13291     my @OUT = << "END";
13292 =begin comment
13293
13294 $HEADER
13295
13296 To change this file, edit $0 instead.
13297
13298 =end comment
13299
13300 =head1 NAME
13301
13302 $pod_file - Index of Unicode Version $string_version properties in Perl
13303
13304 =head1 DESCRIPTION
13305
13306 There are many properties in Unicode, and Perl provides access to almost all of
13307 them, as well as some additional extensions and short-cut synonyms.
13308
13309 And just about all of the few that aren't accessible through the Perl
13310 core are accessible through the modules: L<Unicode::Normalize> and
13311 L<Unicode::UCD>, and for Unihan properties, via the CPAN module
13312 L<Unicode::Unihan>.
13313
13314 This document merely lists all available properties and does not attempt to
13315 explain what each property really means.  There is a brief description of each
13316 Perl extension.  There is some detail about Blocks, Scripts, General_Category,
13317 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
13318 Unicode properties, refer to the Unicode standard.  A good starting place is
13319 L<$unicode_reference_url>.  More information on the Perl extensions is in
13320 L<perlunicode/Other Properties>.
13321
13322 Note that you can define your own properties; see
13323 L<perlunicode/"User-Defined Character Properties">.
13324
13325 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
13326
13327 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13328 most of the Unicode character properties.  The table below shows all these
13329 constructs, both single and compound forms.
13330
13331 B<Compound forms> consist of two components, separated by an equals sign or a
13332 colon.  The first component is the property name, and the second component is
13333 the particular value of the property to match against, for example,
13334 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
13335 whose Script property is Greek.
13336
13337 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
13338 their equivalent compound forms.  The table shows these equivalences.  (In our
13339 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
13340 There are also a few Perl-defined single forms that are not shortcuts for a
13341 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
13342
13343 In parsing these constructs, Perl always ignores Upper/lower case differences
13344 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
13345 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
13346 the left brace completely changes the meaning of the construct, from "match"
13347 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
13348 for improved legibility.
13349
13350 Also, white space, hyphens, and underscores are also normally ignored
13351 everywhere between the {braces}, and hence can be freely added or removed
13352 even if the C</x> modifier hasn't been specified on the regular expression.
13353 But $a_bold_stricter at the beginning of an entry in the table below
13354 means that tighter (stricter) rules are used for that entry:
13355
13356 =over 4
13357
13358 =item Single form (C<\\p{name}>) tighter rules:
13359
13360 White space, hyphens, and underscores ARE significant
13361 except for:
13362
13363 =over 4
13364
13365 =item * white space adjacent to a non-word character
13366
13367 =item * underscores separating digits in numbers
13368
13369 =back
13370
13371 That means, for example, that you can freely add or remove white space
13372 adjacent to (but within) the braces without affecting the meaning.
13373
13374 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
13375
13376 The tighter rules given above for the single form apply to everything to the
13377 right of the colon or equals; the looser rules still apply to everything to
13378 the left.
13379
13380 That means, for example, that you can freely add or remove white space
13381 adjacent to (but within) the braces and the colon or equal sign.
13382
13383 =back
13384
13385 Some properties are considered obsolete by Unicode, but still available.
13386 There are several varieties of obsolescence:
13387
13388 =over 4
13389
13390 =item Stabilized
13391
13392 Obsolete properties may be stabilized.  Such a determination does not indicate
13393 that the property should or should not be used; instead it is a declaration
13394 that the property will not be maintained nor extended for newly encoded
13395 characters.  Such properties are marked with $a_bold_stabilized in the
13396 table.
13397
13398 =item Deprecated
13399
13400 An obsolete property may be deprecated, perhaps because its original intent
13401 has been replaced by another property, or because its specification was
13402 somehow defective.  This means that its use is strongly
13403 discouraged, so much so that a warning will be issued if used, unless the
13404 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13405 statement.  $A_bold_deprecated flags each such entry in the table, and
13406 the entry there for the longest, most descriptive version of the property will
13407 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
13408 warning, even for properties that aren't officially deprecated by Unicode,
13409 when there used to be characters or code points that were matched by them, but
13410 no longer.  This is to warn you that your program may not work like it did on
13411 earlier Unicode releases.
13412
13413 A deprecated property may be made unavailable in a future Perl version, so it
13414 is best to move away from them.
13415
13416 A deprecated property may also be stabilized, but this fact is not shown.
13417
13418 =item Obsolete
13419
13420 Properties marked with $a_bold_obsolete in the table are considered (plain)
13421 obsolete.  Generally this designation is given to properties that Unicode once
13422 used for internal purposes (but not any longer).
13423
13424 =back
13425
13426 Some Perl extensions are present for backwards compatibility and are
13427 discouraged from being used, but are not obsolete.  $A_bold_discouraged
13428 flags each such entry in the table.  Future Unicode versions may force
13429 some of these extensions to be removed without warning, replaced by another
13430 property with the same name that means something different.  Use the
13431 equivalent shown instead.
13432
13433 @block_warning
13434
13435 The table below has two columns.  The left column contains the C<\\p{}>
13436 constructs to look up, possibly preceded by the flags mentioned above; and
13437 the right column contains information about them, like a description, or
13438 synonyms.  It shows both the single and compound forms for each property that
13439 has them.  If the left column is a short name for a property, the right column
13440 will give its longer, more descriptive name; and if the left column is the
13441 longest name, the right column will show any equivalent shortest name, in both
13442 single and compound forms if applicable.
13443
13444 The right column will also caution you if a property means something different
13445 than what might normally be expected.
13446
13447 All single forms are Perl extensions; a few compound forms are as well, and
13448 are noted as such.
13449
13450 Numbers in (parentheses) indicate the total number of code points matched by
13451 the property.  For emphasis, those properties that match no code points at all
13452 are listed as well in a separate section following the table.
13453
13454 Most properties match the same code points regardless of whether C<"/i">
13455 case-insensitive matching is specified or not.  But a few properties are
13456 affected.  These are shown with the notation
13457
13458  (/i= other_property)
13459
13460 in the second column.  Under case-insensitive matching they match the
13461 same code pode points as the property "other_property".
13462
13463 There is no description given for most non-Perl defined properties (See
13464 L<$unicode_reference_url> for that).
13465
13466 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13467 combinations.  For example, entries like:
13468
13469  \\p{Gc: *}                                  \\p{General_Category: *}
13470
13471 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13472 for the latter is also valid for the former.  Similarly,
13473
13474  \\p{Is_*}                                   \\p{*}
13475
13476 means that if and only if, for example, C<\\p{Foo}> exists, then
13477 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13478 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13479 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
13480 underscore.
13481
13482 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13483 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
13484 'N*' to indicate this, and doesn't have separate entries for the other
13485 possibilities.  Note that not all properties which have values 'Yes' and 'No'
13486 are binary, and they have all their values spelled out without using this wild
13487 card, and a C<NOT> clause in their description that highlights their not being
13488 binary.  These also require the compound form to match them, whereas true
13489 binary properties have both single and compound forms available.
13490
13491 Note that all non-essential underscores are removed in the display of the
13492 short names below.
13493
13494 B<Legend summary:>
13495
13496 =over 4
13497
13498 =item Z<>B<*> is a wild-card
13499
13500 =item B<(\\d+)> in the info column gives the number of code points matched by
13501 this property.
13502
13503 =item B<$DEPRECATED> means this is deprecated.
13504
13505 =item B<$OBSOLETE> means this is obsolete.
13506
13507 =item B<$STABILIZED> means this is stabilized.
13508
13509 =item B<$STRICTER> means tighter (stricter) name matching applies.
13510
13511 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13512 stable.
13513
13514 =back
13515
13516 $formatted_properties
13517
13518 $zero_matches
13519
13520 =head1 Properties not accessible through \\p{} and \\P{}
13521
13522 A few properties are accessible in Perl via various function calls only.
13523 These are:
13524
13525  Lowercase_Mapping          lc() and lcfirst()
13526  Titlecase_Mapping          ucfirst()
13527  Uppercase_Mapping          uc()
13528
13529 Case_Folding is accessible through the C</i> modifier in regular expressions.
13530
13531 The Name property is accessible through the C<\\N{}> interpolation in
13532 double-quoted strings and regular expressions, but both usages require a C<use
13533 charnames;> to be specified, which also contains related functions viacode(),
13534 vianame(), and string_vianame().
13535
13536 =head1 Unicode regular expression properties that are NOT accepted by Perl
13537
13538 Perl will generate an error for a few character properties in Unicode when
13539 used in a regular expression.  The non-Unihan ones are listed below, with the
13540 reasons they are not accepted, perhaps with work-arounds.  The short names for
13541 the properties are listed enclosed in (parentheses).
13542 As described after the list, an installation can change the defaults and choose
13543 to accept any of these.  The list is machine generated based on the
13544 choices made for the installation that generated this document.
13545
13546 =over 4
13547
13548 @bad_re_properties
13549
13550 =back
13551
13552 An installation can choose to allow any of these to be matched by downloading
13553 the Unicode database from L<http://www.unicode.org/Public/> to
13554 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
13555 controlling lists contained in the program
13556 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
13557 (C<\%Config> is available from the Config module).
13558
13559 =head1 Files in the I<To> directory (for serious hackers only)
13560
13561 All Unicode properties are really mappings (in the mathematical sense) from
13562 code points to their respective values.  As part of its build process,
13563 Perl constructs tables containing these mappings for all properties that it
13564 deals with.  Some, but not all, of these are written out into files.
13565 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
13566 (C<%Config> is available from the C<Config> module).
13567
13568 Perl reserves the right to change the format and even the existence of any of
13569 those files without notice, except the ones that were in existence prior to
13570 release 5.14.  If those change, a deprecation cycle will be done first.  These
13571 are:
13572
13573 @map_tables_actually_output
13574
13575 Each of the files in this directory defines several hash entries to help
13576 reading programs decipher it.  One of them looks like this:
13577
13578     \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
13579
13580 where "NAME" is a name to indicate the property.  For backwards compatibility,
13581 this is not necessarily the property's official Unicode name.  (The "To" is
13582 also for backwards compatibility.)  The hash entry gives the format of the
13583 mapping fields of the table, currently one of the following:
13584
13585 @map_table_formats
13586
13587 This format applies only to the entries in the main body of the table.
13588 Entries defined in hashes or ones that are missing from the list can have a
13589 different format.
13590
13591 The value that the missing entries have is given by another SwashInfo hash
13592 entry line; it looks like this:
13593
13594     \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
13595
13596 This example line says that any Unicode code points not explicitly listed in
13597 the file have the value "NaN" under the property indicated by NAME.  If the
13598 value is the special string C<< <code point> >>, it means that the value for
13599 any missing code point is the code point itself.  This happens, for example,
13600 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
13601 character "A", are missing because the uppercase of "A" is itself.
13602
13603 Finally, if the file contains a hash for special case entries, its name is
13604 specified by an entry that looks like this:
13605
13606     \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
13607
13608 =head1 SEE ALSO
13609
13610 L<$unicode_reference_url>
13611
13612 L<perlrecharclass>
13613
13614 L<perlunicode>
13615
13616 END
13617
13618     # And write it.  The 0 means no utf8.
13619     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
13620     return;
13621 }
13622
13623 sub make_Heavy () {
13624     # Create and write Heavy.pl, which passes info about the tables to
13625     # utf8_heavy.pl
13626
13627     my @heavy = <<END;
13628 $HEADER
13629 $INTERNAL_ONLY
13630
13631 # This file is for the use of utf8_heavy.pl
13632
13633 # Maps Unicode (not Perl single-form extensions) property names in loose
13634 # standard form to their corresponding standard names
13635 \%utf8::loose_property_name_of = (
13636 END
13637
13638     push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13639     push @heavy, <<END;
13640 );
13641
13642 # Maps property, table to file for those using stricter matching
13643 \%utf8::stricter_to_file_of = (
13644 END
13645     push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13646     push @heavy, <<END;
13647 );
13648
13649 # Maps property, table to file for those using loose matching
13650 \%utf8::loose_to_file_of = (
13651 END
13652     push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13653     push @heavy, <<END;
13654 );
13655
13656 # Maps floating point to fractional form
13657 \%utf8::nv_floating_to_rational = (
13658 END
13659     push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13660     push @heavy, <<END;
13661 );
13662
13663 # If a floating point number doesn't have enough digits in it to get this
13664 # close to a fraction, it isn't considered to be that fraction even if all the
13665 # digits it does have match.
13666 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
13667
13668 # Deprecated tables to generate a warning for.  The key is the file containing
13669 # the table, so as to avoid duplication, as many property names can map to the
13670 # file, but we only need one entry for all of them.
13671 \%utf8::why_deprecated = (
13672 END
13673
13674     push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13675     push @heavy, <<END;
13676 );
13677
13678 # A few properties have different behavior under /i matching.  This maps the
13679 # those to substitute files to use under /i.
13680 \%utf8::caseless_equivalent = (
13681 END
13682
13683     # We set the key to the file when we associated files with tables, but we
13684     # couldn't do the same for the value then, as we might not have the file
13685     # for the alternate table figured out at that time.
13686     foreach my $cased (keys %caseless_equivalent_to) {
13687         my @path = $caseless_equivalent_to{$cased}->file_path;
13688         my $path = join '/', @path[1, -1];
13689         $utf8::caseless_equivalent_to{$cased} = $path;
13690     }
13691     push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
13692     push @heavy, <<END;
13693 );
13694
13695 1;
13696 END
13697
13698     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
13699     return;
13700 }
13701
13702 sub write_all_tables() {
13703     # Write out all the tables generated by this program to files, as well as
13704     # the supporting data structures, pod file, and .t file.
13705
13706     my @writables;              # List of tables that actually get written
13707     my %match_tables_to_write;  # Used to collapse identical match tables
13708                                 # into one file.  Each key is a hash function
13709                                 # result to partition tables into buckets.
13710                                 # Each value is an array of the tables that
13711                                 # fit in the bucket.
13712
13713     # For each property ...
13714     # (sort so that if there is an immutable file name, it has precedence, so
13715     # some other property can't come in and take over its file name.  If b's
13716     # file name is defined, will return 1, meaning to take it first; don't
13717     # care if both defined, as they had better be different anyway.  And the
13718     # property named 'Perl' needs to be first (it doesn't have any immutable
13719     # file name) because empty properties are defined in terms of it's table
13720     # named 'Any'.)
13721     PROPERTY:
13722     foreach my $property (sort { return -1 if $a == $perl;
13723                                  return 1 if $b == $perl;
13724                                  return defined $b->file
13725                                 } property_ref('*'))
13726     {
13727         my $type = $property->type;
13728
13729         # And for each table for that property, starting with the mapping
13730         # table for it ...
13731         TABLE:
13732         foreach my $table($property,
13733
13734                         # and all the match tables for it (if any), sorted so
13735                         # the ones with the shortest associated file name come
13736                         # first.  The length sorting prevents problems of a
13737                         # longer file taking a name that might have to be used
13738                         # by a shorter one.  The alphabetic sorting prevents
13739                         # differences between releases
13740                         sort {  my $ext_a = $a->external_name;
13741                                 return 1 if ! defined $ext_a;
13742                                 my $ext_b = $b->external_name;
13743                                 return -1 if ! defined $ext_b;
13744
13745                                 # But return the non-complement table before
13746                                 # the complement one, as the latter is defined
13747                                 # in terms of the former, and needs to have
13748                                 # the information for the former available.
13749                                 return 1 if $a->complement != 0;
13750                                 return -1 if $b->complement != 0;
13751
13752                                 my $cmp = length $ext_a <=> length $ext_b;
13753
13754                                 # Return result if lengths not equal
13755                                 return $cmp if $cmp;
13756
13757                                 # Alphabetic if lengths equal
13758                                 return $ext_a cmp $ext_b
13759                         } $property->tables
13760                     )
13761         {
13762
13763             # Here we have a table associated with a property.  It could be
13764             # the map table (done first for each property), or one of the
13765             # other tables.  Determine which type.
13766             my $is_property = $table->isa('Property');
13767
13768             my $name = $table->name;
13769             my $complete_name = $table->complete_name;
13770
13771             # See if should suppress the table if is empty, but warn if it
13772             # contains something.
13773             my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
13774                                     keys %why_suppress_if_empty_warn_if_not;
13775
13776             # Calculate if this table should have any code points associated
13777             # with it or not.
13778             my $expected_empty =
13779
13780                 # $perl should be empty, as well as properties that we just
13781                 # don't do anything with
13782                 ($is_property
13783                     && ($table == $perl
13784                         || grep { $complete_name eq $_ }
13785                                                     @unimplemented_properties
13786                     )
13787                 )
13788
13789                 # Match tables in properties we skipped populating should be
13790                 # empty
13791                 || (! $is_property && ! $property->to_create_match_tables)
13792
13793                 # Tables and properties that are expected to have no code
13794                 # points should be empty
13795                 || $suppress_if_empty_warn_if_not
13796             ;
13797
13798             # Set a boolean if this table is the complement of an empty binary
13799             # table
13800             my $is_complement_of_empty_binary =
13801                 $type == $BINARY &&
13802                 (($table == $property->table('Y')
13803                     && $property->table('N')->is_empty)
13804                 || ($table == $property->table('N')
13805                     && $property->table('Y')->is_empty));
13806
13807
13808             # Some tables should match everything
13809             my $expected_full =
13810                 ($is_property)
13811                 ? # All these types of map tables will be full because
13812                   # they will have been populated with defaults
13813                   ($type == $ENUM || $type == $BINARY)
13814
13815                 : # A match table should match everything if its method
13816                   # shows it should
13817                   ($table->matches_all
13818
13819                   # The complement of an empty binary table will match
13820                   # everything
13821                   || $is_complement_of_empty_binary
13822                   )
13823             ;
13824
13825             if ($table->is_empty) {
13826
13827                 if ($suppress_if_empty_warn_if_not) {
13828                     $table->set_status($SUPPRESSED,
13829                         $why_suppress_if_empty_warn_if_not{$complete_name});
13830                 }
13831
13832                 # Suppress (by skipping them) expected empty tables.
13833                 next TABLE if $expected_empty;
13834
13835                 # And setup to later output a warning for those that aren't
13836                 # known to be allowed to be empty.  Don't do the warning if
13837                 # this table is a child of another one to avoid duplicating
13838                 # the warning that should come from the parent one.
13839                 if (($table == $property || $table->parent == $table)
13840                     && $table->status ne $SUPPRESSED
13841                     && ! grep { $complete_name =~ /^$_$/ }
13842                                                     @tables_that_may_be_empty)
13843                 {
13844                     push @unhandled_properties, "$table";
13845                 }
13846
13847                 # An empty table is just the complement of everything.
13848                 $table->set_complement($Any) if $table != $property;
13849             }
13850             elsif ($expected_empty) {
13851                 my $because = "";
13852                 if ($suppress_if_empty_warn_if_not) {
13853                     $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
13854                 }
13855
13856                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
13857             }
13858
13859             my $count = $table->count;
13860             if ($expected_full) {
13861                 if ($count != $MAX_UNICODE_CODEPOINTS) {
13862                     Carp::my_carp("$table matches only "
13863                     . clarify_number($count)
13864                     . " Unicode code points but should match "
13865                     . clarify_number($MAX_UNICODE_CODEPOINTS)
13866                     . " (off by "
13867                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
13868                     . ").  Proceeding anyway.");
13869                 }
13870
13871                 # Here is expected to be full.  If it is because it is the
13872                 # complement of an (empty) binary table that is to be
13873                 # suppressed, then suppress this one as well.
13874                 if ($is_complement_of_empty_binary) {
13875                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
13876                     my $opposing = $property->table($opposing_name);
13877                     my $opposing_status = $opposing->status;
13878                     if ($opposing_status) {
13879                         $table->set_status($opposing_status,
13880                                            $opposing->status_info);
13881                     }
13882                 }
13883             }
13884             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
13885                 if ($table == $property || $table->leader == $table) {
13886                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
13887                 }
13888             }
13889
13890             if ($table->status eq $SUPPRESSED) {
13891                 if (! $is_property) {
13892                     my @children = $table->children;
13893                     foreach my $child (@children) {
13894                         if ($child->status ne $SUPPRESSED) {
13895                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
13896                         }
13897                     }
13898                 }
13899                 next TABLE;
13900
13901             }
13902             if (! $is_property) {
13903
13904                 # Several things need to be done just once for each related
13905                 # group of match tables.  Do them on the parent.
13906                 if ($table->parent == $table) {
13907
13908                     # Add an entry in the pod file for the table; it also does
13909                     # the children.
13910                     make_table_pod_entries($table) if defined $pod_directory;
13911
13912                     # See if the the table matches identical code points with
13913                     # something that has already been output.  In that case,
13914                     # no need to have two files with the same code points in
13915                     # them.  We use the table's hash() method to store these
13916                     # in buckets, so that it is quite likely that if two
13917                     # tables are in the same bucket they will be identical, so
13918                     # don't have to compare tables frequently.  The tables
13919                     # have to have the same status to share a file, so add
13920                     # this to the bucket hash.  (The reason for this latter is
13921                     # that Heavy.pl associates a status with a file.)
13922                     my $hash = $table->hash . ';' . $table->status;
13923
13924                     # Look at each table that is in the same bucket as this
13925                     # one would be.
13926                     foreach my $comparison (@{$match_tables_to_write{$hash}})
13927                     {
13928                         if ($table->matches_identically_to($comparison)) {
13929                             $table->set_equivalent_to($comparison,
13930                                                                 Related => 0);
13931                             next TABLE;
13932                         }
13933                     }
13934
13935                     # Here, not equivalent, add this table to the bucket.
13936                     push @{$match_tables_to_write{$hash}}, $table;
13937                 }
13938             }
13939             else {
13940
13941                 # Here is the property itself.
13942                 # Don't write out or make references to the $perl property
13943                 next if $table == $perl;
13944
13945                 if ($type != $STRING) {
13946
13947                     # There is a mapping stored of the various synonyms to the
13948                     # standardized name of the property for utf8_heavy.pl.
13949                     # Also, the pod file contains entries of the form:
13950                     # \p{alias: *}         \p{full: *}
13951                     # rather than show every possible combination of things.
13952
13953                     my @property_aliases = $property->aliases;
13954
13955                     # The full name of this property is stored by convention
13956                     # first in the alias array
13957                     my $full_property_name =
13958                                 '\p{' . $property_aliases[0]->name . ': *}';
13959                     my $standard_property_name = standardize($table->name);
13960
13961                     # For each synonym ...
13962                     for my $i (0 .. @property_aliases - 1)  {
13963                         my $alias = $property_aliases[$i];
13964                         my $alias_name = $alias->name;
13965                         my $alias_standard = standardize($alias_name);
13966
13967                         # For utf8_heavy, set the mapping of the alias to the
13968                         # property
13969                         if (exists ($loose_property_name_of{$alias_standard}))
13970                         {
13971                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
13972                         }
13973                         else {
13974                             $loose_property_name_of{$alias_standard}
13975                                                 = $standard_property_name;
13976                         }
13977
13978                         # Now for the pod entry for this alias.  Skip if not
13979                         # outputting a pod; skip the first one, which is the
13980                         # full name so won't have an entry like: '\p{full: *}
13981                         # \p{full: *}', and skip if don't want an entry for
13982                         # this one.
13983                         next if $i == 0
13984                                 || ! defined $pod_directory
13985                                 || ! $alias->make_pod_entry;
13986
13987                         my $rhs = $full_property_name;
13988                         if ($property != $perl && $table->perl_extension) {
13989                             $rhs .= ' (Perl extension)';
13990                         }
13991                         push @match_properties,
13992                             format_pod_line($indent_info_column,
13993                                         '\p{' . $alias->name . ': *}',
13994                                         $rhs,
13995                                         $alias->status);
13996                     }
13997                 } # End of non-string-like property code
13998
13999
14000                 # Don't write out a mapping file if not desired.
14001                 next if ! $property->to_output_map;
14002             }
14003
14004             # Here, we know we want to write out the table, but don't do it
14005             # yet because there may be other tables that come along and will
14006             # want to share the file, and the file's comments will change to
14007             # mention them.  So save for later.
14008             push @writables, $table;
14009
14010         } # End of looping through the property and all its tables.
14011     } # End of looping through all properties.
14012
14013     # Now have all the tables that will have files written for them.  Do it.
14014     foreach my $table (@writables) {
14015         my @directory;
14016         my $filename;
14017         my $property = $table->property;
14018         my $is_property = ($table == $property);
14019         if (! $is_property) {
14020
14021             # Match tables for the property go in lib/$subdirectory, which is
14022             # the property's name.  Don't use the standard file name for this,
14023             # as may get an unfamiliar alias
14024             @directory = ($matches_directory, $property->external_name);
14025         }
14026         else {
14027
14028             @directory = $table->directory;
14029             $filename = $table->file;
14030         }
14031
14032         # Use specified filename if available, or default to property's
14033         # shortest name.  We need an 8.3 safe filename (which means "an 8
14034         # safe" filename, since after the dot is only 'pl', which is < 3)
14035         # The 2nd parameter is if the filename shouldn't be changed, and
14036         # it shouldn't iff there is a hard-coded name for this table.
14037         $filename = construct_filename(
14038                                 $filename || $table->external_name,
14039                                 ! $filename,    # mutable if no filename
14040                                 \@directory);
14041
14042         register_file_for_name($table, \@directory, $filename);
14043
14044         # Only need to write one file when shared by more than one
14045         # property
14046         next if ! $is_property
14047                 && ($table->leader != $table || $table->complement != 0);
14048
14049         # Construct a nice comment to add to the file
14050         $table->set_final_comment;
14051
14052         $table->write;
14053     }
14054
14055
14056     # Write out the pod file
14057     make_pod;
14058
14059     # And Heavy.pl
14060     make_Heavy;
14061
14062     make_property_test_script() if $make_test_script;
14063     return;
14064 }
14065
14066 my @white_space_separators = ( # This used only for making the test script.
14067                             "",
14068                             ' ',
14069                             "\t",
14070                             '   '
14071                         );
14072
14073 sub generate_separator($) {
14074     # This used only for making the test script.  It generates the colon or
14075     # equal separator between the property and property value, with random
14076     # white space surrounding the separator
14077
14078     my $lhs = shift;
14079
14080     return "" if $lhs eq "";  # No separator if there's only one (the r) side
14081
14082     # Choose space before and after randomly
14083     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
14084     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
14085
14086     # And return the whole complex, half the time using a colon, half the
14087     # equals
14088     return $spaces_before
14089             . (rand() < 0.5) ? '=' : ':'
14090             . $spaces_after;
14091 }
14092
14093 sub generate_tests($$$$$) {
14094     # This used only for making the test script.  It generates test cases that
14095     # are expected to compile successfully in perl.  Note that the lhs and
14096     # rhs are assumed to already be as randomized as the caller wants.
14097
14098     my $lhs = shift;           # The property: what's to the left of the colon
14099                                #  or equals separator
14100     my $rhs = shift;           # The property value; what's to the right
14101     my $valid_code = shift;    # A code point that's known to be in the
14102                                # table given by lhs=rhs; undef if table is
14103                                # empty
14104     my $invalid_code = shift;  # A code point known to not be in the table;
14105                                # undef if the table is all code points
14106     my $warning = shift;
14107
14108     # Get the colon or equal
14109     my $separator = generate_separator($lhs);
14110
14111     # The whole 'property=value'
14112     my $name = "$lhs$separator$rhs";
14113
14114     my @output;
14115     # Create a complete set of tests, with complements.
14116     if (defined $valid_code) {
14117         push @output, <<"EOC"
14118 Expect(1, $valid_code, '\\p{$name}', $warning);
14119 Expect(0, $valid_code, '\\p{^$name}', $warning);
14120 Expect(0, $valid_code, '\\P{$name}', $warning);
14121 Expect(1, $valid_code, '\\P{^$name}', $warning);
14122 EOC
14123     }
14124     if (defined $invalid_code) {
14125         push @output, <<"EOC"
14126 Expect(0, $invalid_code, '\\p{$name}', $warning);
14127 Expect(1, $invalid_code, '\\p{^$name}', $warning);
14128 Expect(1, $invalid_code, '\\P{$name}', $warning);
14129 Expect(0, $invalid_code, '\\P{^$name}', $warning);
14130 EOC
14131     }
14132     return @output;
14133 }
14134
14135 sub generate_error($$$) {
14136     # This used only for making the test script.  It generates test cases that
14137     # are expected to not only not match, but to be syntax or similar errors
14138
14139     my $lhs = shift;                # The property: what's to the left of the
14140                                     # colon or equals separator
14141     my $rhs = shift;                # The property value; what's to the right
14142     my $already_in_error = shift;   # Boolean; if true it's known that the
14143                                 # unmodified lhs and rhs will cause an error.
14144                                 # This routine should not force another one
14145     # Get the colon or equal
14146     my $separator = generate_separator($lhs);
14147
14148     # Since this is an error only, don't bother to randomly decide whether to
14149     # put the error on the left or right side; and assume that the rhs is
14150     # loosely matched, again for convenience rather than rigor.
14151     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
14152
14153     my $property = $lhs . $separator . $rhs;
14154
14155     return <<"EOC";
14156 Error('\\p{$property}');
14157 Error('\\P{$property}');
14158 EOC
14159 }
14160
14161 # These are used only for making the test script
14162 # XXX Maybe should also have a bad strict seps, which includes underscore.
14163
14164 my @good_loose_seps = (
14165             " ",
14166             "-",
14167             "\t",
14168             "",
14169             "_",
14170            );
14171 my @bad_loose_seps = (
14172            "/a/",
14173            ':=',
14174           );
14175
14176 sub randomize_stricter_name {
14177     # This used only for making the test script.  Take the input name and
14178     # return a randomized, but valid version of it under the stricter matching
14179     # rules.
14180
14181     my $name = shift;
14182     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14183
14184     # If the name looks like a number (integer, floating, or rational), do
14185     # some extra work
14186     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
14187         my $sign = $1;
14188         my $number = $2;
14189         my $separator = $3;
14190
14191         # If there isn't a sign, part of the time add a plus
14192         # Note: Not testing having any denominator having a minus sign
14193         if (! $sign) {
14194             $sign = '+' if rand() <= .3;
14195         }
14196
14197         # And add 0 or more leading zeros.
14198         $name = $sign . ('0' x int rand(10)) . $number;
14199
14200         if (defined $separator) {
14201             my $extra_zeros = '0' x int rand(10);
14202
14203             if ($separator eq '.') {
14204
14205                 # Similarly, add 0 or more trailing zeros after a decimal
14206                 # point
14207                 $name .= $extra_zeros;
14208             }
14209             else {
14210
14211                 # Or, leading zeros before the denominator
14212                 $name =~ s,/,/$extra_zeros,;
14213             }
14214         }
14215     }
14216
14217     # For legibility of the test, only change the case of whole sections at a
14218     # time.  To do this, first split into sections.  The split returns the
14219     # delimiters
14220     my @sections;
14221     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
14222         trace $section if main::DEBUG && $to_trace;
14223
14224         if (length $section > 1 && $section !~ /\D/) {
14225
14226             # If the section is a sequence of digits, about half the time
14227             # randomly add underscores between some of them.
14228             if (rand() > .5) {
14229
14230                 # Figure out how many underscores to add.  max is 1 less than
14231                 # the number of digits.  (But add 1 at the end to make sure
14232                 # result isn't 0, and compensate earlier by subtracting 2
14233                 # instead of 1)
14234                 my $num_underscores = int rand(length($section) - 2) + 1;
14235
14236                 # And add them evenly throughout, for convenience, not rigor
14237                 use integer;
14238                 my $spacing = (length($section) - 1)/ $num_underscores;
14239                 my $temp = $section;
14240                 $section = "";
14241                 for my $i (1 .. $num_underscores) {
14242                     $section .= substr($temp, 0, $spacing, "") . '_';
14243                 }
14244                 $section .= $temp;
14245             }
14246             push @sections, $section;
14247         }
14248         else {
14249
14250             # Here not a sequence of digits.  Change the case of the section
14251             # randomly
14252             my $switch = int rand(4);
14253             if ($switch == 0) {
14254                 push @sections, uc $section;
14255             }
14256             elsif ($switch == 1) {
14257                 push @sections, lc $section;
14258             }
14259             elsif ($switch == 2) {
14260                 push @sections, ucfirst $section;
14261             }
14262             else {
14263                 push @sections, $section;
14264             }
14265         }
14266     }
14267     trace "returning", join "", @sections if main::DEBUG && $to_trace;
14268     return join "", @sections;
14269 }
14270
14271 sub randomize_loose_name($;$) {
14272     # This used only for making the test script
14273
14274     my $name = shift;
14275     my $want_error = shift;  # if true, make an error
14276     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14277
14278     $name = randomize_stricter_name($name);
14279
14280     my @parts;
14281     push @parts, $good_loose_seps[rand(@good_loose_seps)];
14282
14283     # Preserve trailing ones for the sake of not stripping the underscore from
14284     # 'L_'
14285     for my $part (split /[-\s_]+ (?= . )/, $name) {
14286         if (@parts) {
14287             if ($want_error and rand() < 0.3) {
14288                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
14289                 $want_error = 0;
14290             }
14291             else {
14292                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
14293             }
14294         }
14295         push @parts, $part;
14296     }
14297     my $new = join("", @parts);
14298     trace "$name => $new" if main::DEBUG && $to_trace;
14299
14300     if ($want_error) {
14301         if (rand() >= 0.5) {
14302             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
14303         }
14304         else {
14305             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
14306         }
14307     }
14308     return $new;
14309 }
14310
14311 # Used to make sure don't generate duplicate test cases.
14312 my %test_generated;
14313
14314 sub make_property_test_script() {
14315     # This used only for making the test script
14316     # this written directly -- it's huge.
14317
14318     print "Making test script\n" if $verbosity >= $PROGRESS;
14319
14320     # This uses randomness to test different possibilities without testing all
14321     # possibilities.  To ensure repeatability, set the seed to 0.  But if
14322     # tests are added, it will perturb all later ones in the .t file
14323     srand 0;
14324
14325     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
14326
14327     # Keep going down an order of magnitude
14328     # until find that adding this quantity to
14329     # 1 remains 1; but put an upper limit on
14330     # this so in case this algorithm doesn't
14331     # work properly on some platform, that we
14332     # won't loop forever.
14333     my $digits = 0;
14334     my $min_floating_slop = 1;
14335     while (1+ $min_floating_slop != 1
14336             && $digits++ < 50)
14337     {
14338         my $next = $min_floating_slop / 10;
14339         last if $next == 0; # If underflows,
14340                             # use previous one
14341         $min_floating_slop = $next;
14342     }
14343
14344     # It doesn't matter whether the elements of this array contain single lines
14345     # or multiple lines. main::write doesn't count the lines.
14346     my @output;
14347
14348     foreach my $property (property_ref('*')) {
14349         foreach my $table ($property->tables) {
14350
14351             # Find code points that match, and don't match this table.
14352             my $valid = $table->get_valid_code_point;
14353             my $invalid = $table->get_invalid_code_point;
14354             my $warning = ($table->status eq $DEPRECATED)
14355                             ? "'deprecated'"
14356                             : '""';
14357
14358             # Test each possible combination of the property's aliases with
14359             # the table's.  If this gets to be too many, could do what is done
14360             # in the set_final_comment() for Tables
14361             my @table_aliases = $table->aliases;
14362             my @property_aliases = $table->property->aliases;
14363
14364             # Every property can be optionally be prefixed by 'Is_', so test
14365             # that those work, by creating such a new alias for each
14366             # pre-existing one.
14367             push @property_aliases, map { Alias->new("Is_" . $_->name,
14368                                                     $_->loose_match,
14369                                                     $_->make_pod_entry,
14370                                                     $_->externally_ok,
14371                                                     $_->status)
14372                                          } @property_aliases;
14373             my $max = max(scalar @table_aliases, scalar @property_aliases);
14374             for my $j (0 .. $max - 1) {
14375
14376                 # The current alias for property is the next one on the list,
14377                 # or if beyond the end, start over.  Similarly for table
14378                 my $property_name
14379                             = $property_aliases[$j % @property_aliases]->name;
14380
14381                 $property_name = "" if $table->property == $perl;
14382                 my $table_alias = $table_aliases[$j % @table_aliases];
14383                 my $table_name = $table_alias->name;
14384                 my $loose_match = $table_alias->loose_match;
14385
14386                 # If the table doesn't have a file, any test for it is
14387                 # already guaranteed to be in error
14388                 my $already_error = ! $table->file_path;
14389
14390                 # Generate error cases for this alias.
14391                 push @output, generate_error($property_name,
14392                                              $table_name,
14393                                              $already_error);
14394
14395                 # If the table is guaranteed to always generate an error,
14396                 # quit now without generating success cases.
14397                 next if $already_error;
14398
14399                 # Now for the success cases.
14400                 my $random;
14401                 if ($loose_match) {
14402
14403                     # For loose matching, create an extra test case for the
14404                     # standard name.
14405                     my $standard = standardize($table_name);
14406
14407                     # $test_name should be a unique combination for each test
14408                     # case; used just to avoid duplicate tests
14409                     my $test_name = "$property_name=$standard";
14410
14411                     # Don't output duplicate test cases.
14412                     if (! exists $test_generated{$test_name}) {
14413                         $test_generated{$test_name} = 1;
14414                         push @output, generate_tests($property_name,
14415                                                      $standard,
14416                                                      $valid,
14417                                                      $invalid,
14418                                                      $warning,
14419                                                  );
14420                     }
14421                     $random = randomize_loose_name($table_name)
14422                 }
14423                 else { # Stricter match
14424                     $random = randomize_stricter_name($table_name);
14425                 }
14426
14427                 # Now for the main test case for this alias.
14428                 my $test_name = "$property_name=$random";
14429                 if (! exists $test_generated{$test_name}) {
14430                     $test_generated{$test_name} = 1;
14431                     push @output, generate_tests($property_name,
14432                                                  $random,
14433                                                  $valid,
14434                                                  $invalid,
14435                                                  $warning,
14436                                              );
14437
14438                     # If the name is a rational number, add tests for the
14439                     # floating point equivalent.
14440                     if ($table_name =~ qr{/}) {
14441
14442                         # Calculate the float, and find just the fraction.
14443                         my $float = eval $table_name;
14444                         my ($whole, $fraction)
14445                                             = $float =~ / (.*) \. (.*) /x;
14446
14447                         # Starting with one digit after the decimal point,
14448                         # create a test for each possible precision (number of
14449                         # digits past the decimal point) until well beyond the
14450                         # native number found on this machine.  (If we started
14451                         # with 0 digits, it would be an integer, which could
14452                         # well match an unrelated table)
14453                         PLACE:
14454                         for my $i (1 .. $min_floating_slop + 3) {
14455                             my $table_name = sprintf("%.*f", $i, $float);
14456                             if ($i < $MIN_FRACTION_LENGTH) {
14457
14458                                 # If the test case has fewer digits than the
14459                                 # minimum acceptable precision, it shouldn't
14460                                 # succeed, so we expect an error for it.
14461                                 # E.g., 2/3 = .7 at one decimal point, and we
14462                                 # shouldn't say it matches .7.  We should make
14463                                 # it be .667 at least before agreeing that the
14464                                 # intent was to match 2/3.  But at the
14465                                 # less-than- acceptable level of precision, it
14466                                 # might actually match an unrelated number.
14467                                 # So don't generate a test case if this
14468                                 # conflating is possible.  In our example, we
14469                                 # don't want 2/3 matching 7/10, if there is
14470                                 # a 7/10 code point.
14471                                 for my $existing
14472                                         (keys %nv_floating_to_rational)
14473                                 {
14474                                     next PLACE
14475                                         if abs($table_name - $existing)
14476                                                 < $MAX_FLOATING_SLOP;
14477                                 }
14478                                 push @output, generate_error($property_name,
14479                                                              $table_name,
14480                                                              1   # 1 => already an error
14481                                               );
14482                             }
14483                             else {
14484
14485                                 # Here the number of digits exceeds the
14486                                 # minimum we think is needed.  So generate a
14487                                 # success test case for it.
14488                                 push @output, generate_tests($property_name,
14489                                                              $table_name,
14490                                                              $valid,
14491                                                              $invalid,
14492                                                              $warning,
14493                                              );
14494                             }
14495                         }
14496                     }
14497                 }
14498             }
14499         }
14500     }
14501
14502     &write($t_path,
14503            0,           # Not utf8;
14504            [<DATA>,
14505             @output,
14506             (map {"Test_X('$_');\n"} @backslash_X_tests),
14507             "Finished();\n"]);
14508     return;
14509 }
14510
14511 # This is a list of the input files and how to handle them.  The files are
14512 # processed in their order in this list.  Some reordering is possible if
14513 # desired, but the v0 files should be first, and the extracted before the
14514 # others except DAge.txt (as data in an extracted file can be over-ridden by
14515 # the non-extracted.  Some other files depend on data derived from an earlier
14516 # file, like UnicodeData requires data from Jamo, and the case changing and
14517 # folding requires data from Unicode.  Mostly, it safest to order by first
14518 # version releases in (except the Jamo).  DAge.txt is read before the
14519 # extracted ones because of the rarely used feature $compare_versions.  In the
14520 # unlikely event that there were ever an extracted file that contained the Age
14521 # property information, it would have to go in front of DAge.
14522 #
14523 # The version strings allow the program to know whether to expect a file or
14524 # not, but if a file exists in the directory, it will be processed, even if it
14525 # is in a version earlier than expected, so you can copy files from a later
14526 # release into an earlier release's directory.
14527 my @input_file_objects = (
14528     Input_file->new('PropertyAliases.txt', v0,
14529                     Handler => \&process_PropertyAliases,
14530                     ),
14531     Input_file->new(undef, v0,  # No file associated with this
14532                     Progress_Message => 'Finishing property setup',
14533                     Handler => \&finish_property_setup,
14534                     ),
14535     Input_file->new('PropValueAliases.txt', v0,
14536                      Handler => \&process_PropValueAliases,
14537                      Has_Missings_Defaults => $NOT_IGNORED,
14538                      ),
14539     Input_file->new('DAge.txt', v3.2.0,
14540                     Has_Missings_Defaults => $NOT_IGNORED,
14541                     Property => 'Age'
14542                     ),
14543     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
14544                     Property => 'General_Category',
14545                     ),
14546     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
14547                     Property => 'Canonical_Combining_Class',
14548                     Has_Missings_Defaults => $NOT_IGNORED,
14549                     ),
14550     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
14551                     Property => 'Numeric_Type',
14552                     Has_Missings_Defaults => $NOT_IGNORED,
14553                     ),
14554     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
14555                     Property => 'East_Asian_Width',
14556                     Has_Missings_Defaults => $NOT_IGNORED,
14557                     ),
14558     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
14559                     Property => 'Line_Break',
14560                     Has_Missings_Defaults => $NOT_IGNORED,
14561                     ),
14562     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14563                     Property => 'Bidi_Class',
14564                     Has_Missings_Defaults => $NOT_IGNORED,
14565                     ),
14566     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14567                     Property => 'Decomposition_Type',
14568                     Has_Missings_Defaults => $NOT_IGNORED,
14569                     ),
14570     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14571     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14572                     Property => 'Numeric_Value',
14573                     Each_Line_Handler => \&filter_numeric_value_line,
14574                     Has_Missings_Defaults => $NOT_IGNORED,
14575                     ),
14576     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14577                     Property => 'Joining_Group',
14578                     Has_Missings_Defaults => $NOT_IGNORED,
14579                     ),
14580
14581     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14582                     Property => 'Joining_Type',
14583                     Has_Missings_Defaults => $NOT_IGNORED,
14584                     ),
14585     Input_file->new('Jamo.txt', v2.0.0,
14586                     Property => 'Jamo_Short_Name',
14587                     Each_Line_Handler => \&filter_jamo_line,
14588                     ),
14589     Input_file->new('UnicodeData.txt', v1.1.5,
14590                     Pre_Handler => \&setup_UnicodeData,
14591
14592                     # We clean up this file for some early versions.
14593                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
14594                                             ? \&filter_v1_ucd
14595                                             : ($v_version eq v2.1.5)
14596                                                 ? \&filter_v2_1_5_ucd
14597
14598                                                 # And for 5.14 Perls with 6.0,
14599                                                 # have to also make changes
14600                                                 : ($v_version ge v6.0.0)
14601                                                     ? \&filter_v6_ucd
14602                                                     : undef),
14603
14604                                             # And the main filter
14605                                             \&filter_UnicodeData_line,
14606                                          ],
14607                     EOF_Handler => \&EOF_UnicodeData,
14608                     ),
14609     Input_file->new('ArabicShaping.txt', v2.0.0,
14610                     Each_Line_Handler =>
14611                         [ ($v_version lt 4.1.0)
14612                                     ? \&filter_old_style_arabic_shaping
14613                                     : undef,
14614                         \&filter_arabic_shaping_line,
14615                         ],
14616                     Has_Missings_Defaults => $NOT_IGNORED,
14617                     ),
14618     Input_file->new('Blocks.txt', v2.0.0,
14619                     Property => 'Block',
14620                     Has_Missings_Defaults => $NOT_IGNORED,
14621                     Each_Line_Handler => \&filter_blocks_lines
14622                     ),
14623     Input_file->new('PropList.txt', v2.0.0,
14624                     Each_Line_Handler => (($v_version lt v3.1.0)
14625                                             ? \&filter_old_style_proplist
14626                                             : undef),
14627                     ),
14628     Input_file->new('Unihan.txt', v2.0.0,
14629                     Pre_Handler => \&setup_unihan,
14630                     Optional => 1,
14631                     Each_Line_Handler => \&filter_unihan_line,
14632                         ),
14633     Input_file->new('SpecialCasing.txt', v2.1.8,
14634                     Each_Line_Handler => \&filter_special_casing_line,
14635                     Pre_Handler => \&setup_special_casing,
14636                     ),
14637     Input_file->new(
14638                     'LineBreak.txt', v3.0.0,
14639                     Has_Missings_Defaults => $NOT_IGNORED,
14640                     Property => 'Line_Break',
14641                     # Early versions had problematic syntax
14642                     Each_Line_Handler => (($v_version lt v3.1.0)
14643                                         ? \&filter_early_ea_lb
14644                                         : undef),
14645                     ),
14646     Input_file->new('EastAsianWidth.txt', v3.0.0,
14647                     Property => 'East_Asian_Width',
14648                     Has_Missings_Defaults => $NOT_IGNORED,
14649                     # Early versions had problematic syntax
14650                     Each_Line_Handler => (($v_version lt v3.1.0)
14651                                         ? \&filter_early_ea_lb
14652                                         : undef),
14653                     ),
14654     Input_file->new('CompositionExclusions.txt', v3.0.0,
14655                     Property => 'Composition_Exclusion',
14656                     ),
14657     Input_file->new('BidiMirroring.txt', v3.0.1,
14658                     Property => 'Bidi_Mirroring_Glyph',
14659                     ),
14660     Input_file->new("NormalizationTest.txt", v3.0.1,
14661                     Skip => 1,
14662                     ),
14663     Input_file->new('CaseFolding.txt', v3.0.1,
14664                     Pre_Handler => \&setup_case_folding,
14665                     Each_Line_Handler =>
14666                         [ ($v_version lt v3.1.0)
14667                                  ? \&filter_old_style_case_folding
14668                                  : undef,
14669                            \&filter_case_folding_line
14670                         ],
14671                     ),
14672     Input_file->new('DCoreProperties.txt', v3.1.0,
14673                     # 5.2 changed this file
14674                     Has_Missings_Defaults => (($v_version ge v5.2.0)
14675                                             ? $NOT_IGNORED
14676                                             : $NO_DEFAULTS),
14677                     ),
14678     Input_file->new('Scripts.txt', v3.1.0,
14679                     Property => 'Script',
14680                     Has_Missings_Defaults => $NOT_IGNORED,
14681                     ),
14682     Input_file->new('DNormalizationProps.txt', v3.1.0,
14683                     Has_Missings_Defaults => $NOT_IGNORED,
14684                     Each_Line_Handler => (($v_version lt v4.0.1)
14685                                       ? \&filter_old_style_normalization_lines
14686                                       : undef),
14687                     ),
14688     Input_file->new('HangulSyllableType.txt', v4.0.0,
14689                     Has_Missings_Defaults => $NOT_IGNORED,
14690                     Property => 'Hangul_Syllable_Type'),
14691     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
14692                     Property => 'Word_Break',
14693                     Has_Missings_Defaults => $NOT_IGNORED,
14694                     ),
14695     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
14696                     Property => 'Grapheme_Cluster_Break',
14697                     Has_Missings_Defaults => $NOT_IGNORED,
14698                     ),
14699     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
14700                     Handler => \&process_GCB_test,
14701                     ),
14702     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
14703                     Skip => 1,
14704                     ),
14705     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
14706                     Skip => 1,
14707                     ),
14708     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
14709                     Skip => 1,
14710                     ),
14711     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
14712                     Property => 'Sentence_Break',
14713                     Has_Missings_Defaults => $NOT_IGNORED,
14714                     ),
14715     Input_file->new('NamedSequences.txt', v4.1.0,
14716                     Handler => \&process_NamedSequences
14717                     ),
14718     Input_file->new('NameAliases.txt', v5.0.0,
14719                     Property => 'Name_Alias',
14720                     ),
14721     Input_file->new("BidiTest.txt", v5.2.0,
14722                     Skip => 1,
14723                     ),
14724     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
14725                     Optional => 1,
14726                     Each_Line_Handler => \&filter_unihan_line,
14727                     ),
14728     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
14729                     Optional => 1,
14730                     Each_Line_Handler => \&filter_unihan_line,
14731                     ),
14732     Input_file->new('UnihanIRGSources.txt', v5.2.0,
14733                     Optional => 1,
14734                     Pre_Handler => \&setup_unihan,
14735                     Each_Line_Handler => \&filter_unihan_line,
14736                     ),
14737     Input_file->new('UnihanNumericValues.txt', v5.2.0,
14738                     Optional => 1,
14739                     Each_Line_Handler => \&filter_unihan_line,
14740                     ),
14741     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
14742                     Optional => 1,
14743                     Each_Line_Handler => \&filter_unihan_line,
14744                     ),
14745     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
14746                     Optional => 1,
14747                     Each_Line_Handler => \&filter_unihan_line,
14748                     ),
14749     Input_file->new('UnihanReadings.txt', v5.2.0,
14750                     Optional => 1,
14751                     Each_Line_Handler => \&filter_unihan_line,
14752                     ),
14753     Input_file->new('UnihanVariants.txt', v5.2.0,
14754                     Optional => 1,
14755                     Each_Line_Handler => \&filter_unihan_line,
14756                     ),
14757     Input_file->new('ScriptExtensions.txt', v6.0.0,
14758                     Property => 'Script_Extensions',
14759                     Pre_Handler => \&setup_script_extensions,
14760                     ),
14761 );
14762
14763 # End of all the preliminaries.
14764 # Do it...
14765
14766 if ($compare_versions) {
14767     Carp::my_carp(<<END
14768 Warning.  \$compare_versions is set.  Output is not suitable for production
14769 END
14770     );
14771 }
14772
14773 # Put into %potential_files a list of all the files in the directory structure
14774 # that could be inputs to this program, excluding those that we should ignore.
14775 # Use absolute file names because it makes it easier across machine types.
14776 my @ignored_files_full_names = map { File::Spec->rel2abs(
14777                                      internal_file_to_platform($_))
14778                                 } keys %ignored_files;
14779 File::Find::find({
14780     wanted=>sub {
14781         return unless /\.txt$/i;  # Some platforms change the name's case
14782         my $full = lc(File::Spec->rel2abs($_));
14783         $potential_files{$full} = 1
14784                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
14785         return;
14786     }
14787 }, File::Spec->curdir());
14788
14789 my @mktables_list_output_files;
14790 my $old_start_time = 0;
14791
14792 if (! -e $file_list) {
14793     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
14794     $write_unchanged_files = 1;
14795 } elsif ($write_unchanged_files) {
14796     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
14797 }
14798 else {
14799     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
14800     my $file_handle;
14801     if (! open $file_handle, "<", $file_list) {
14802         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
14803         $glob_list = 1;
14804     }
14805     else {
14806         my @input;
14807
14808         # Read and parse mktables.lst, placing the results from the first part
14809         # into @input, and the second part into @mktables_list_output_files
14810         for my $list ( \@input, \@mktables_list_output_files ) {
14811             while (<$file_handle>) {
14812                 s/^ \s+ | \s+ $//xg;
14813                 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
14814                     $old_start_time = $1;
14815                 }
14816                 next if /^ \s* (?: \# .* )? $/x;
14817                 last if /^ =+ $/x;
14818                 my ( $file ) = split /\t/;
14819                 push @$list, $file;
14820             }
14821             @$list = uniques(@$list);
14822             next;
14823         }
14824
14825         # Look through all the input files
14826         foreach my $input (@input) {
14827             next if $input eq 'version'; # Already have checked this.
14828
14829             # Ignore if doesn't exist.  The checking about whether we care or
14830             # not is done via the Input_file object.
14831             next if ! file_exists($input);
14832
14833             # The paths are stored with relative names, and with '/' as the
14834             # delimiter; convert to absolute on this machine
14835             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
14836             $potential_files{$full} = 1
14837                         if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
14838         }
14839     }
14840
14841     close $file_handle;
14842 }
14843
14844 if ($glob_list) {
14845
14846     # Here wants to process all .txt files in the directory structure.
14847     # Convert them to full path names.  They are stored in the platform's
14848     # relative style
14849     my @known_files;
14850     foreach my $object (@input_file_objects) {
14851         my $file = $object->file;
14852         next unless defined $file;
14853         push @known_files, File::Spec->rel2abs($file);
14854     }
14855
14856     my @unknown_input_files;
14857     foreach my $file (keys %potential_files) {
14858         next if grep { lc($file) eq lc($_) } @known_files;
14859
14860         # Here, the file is unknown to us.  Get relative path name
14861         $file = File::Spec->abs2rel($file);
14862         push @unknown_input_files, $file;
14863
14864         # What will happen is we create a data structure for it, and add it to
14865         # the list of input files to process.  First get the subdirectories
14866         # into an array
14867         my (undef, $directories, undef) = File::Spec->splitpath($file);
14868         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
14869         my @directories = File::Spec->splitdir($directories);
14870
14871         # If the file isn't extracted (meaning none of the directories is the
14872         # extracted one), just add it to the end of the list of inputs.
14873         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
14874             push @input_file_objects, Input_file->new($file, v0);
14875         }
14876         else {
14877
14878             # Here, the file is extracted.  It needs to go ahead of most other
14879             # processing.  Search for the first input file that isn't a
14880             # special required property (that is, find one whose first_release
14881             # is non-0), and isn't extracted.  Also, the Age property file is
14882             # processed before the extracted ones, just in case
14883             # $compare_versions is set.
14884             for (my $i = 0; $i < @input_file_objects; $i++) {
14885                 if ($input_file_objects[$i]->first_released ne v0
14886                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
14887                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
14888                 {
14889                     splice @input_file_objects, $i, 0,
14890                                                 Input_file->new($file, v0);
14891                     last;
14892                 }
14893             }
14894
14895         }
14896     }
14897     if (@unknown_input_files) {
14898         print STDERR simple_fold(join_lines(<<END
14899
14900 The following files are unknown as to how to handle.  Assuming they are
14901 typical property files.  You'll know by later error messages if it worked or
14902 not:
14903 END
14904         ) . " " . join(", ", @unknown_input_files) . "\n\n");
14905     }
14906 } # End of looking through directory structure for more .txt files.
14907
14908 # Create the list of input files from the objects we have defined, plus
14909 # version
14910 my @input_files = 'version';
14911 foreach my $object (@input_file_objects) {
14912     my $file = $object->file;
14913     next if ! defined $file;    # Not all objects have files
14914     next if $object->optional && ! -e $file;
14915     push @input_files,  $file;
14916 }
14917
14918 if ( $verbosity >= $VERBOSE ) {
14919     print "Expecting ".scalar( @input_files )." input files. ",
14920          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
14921 }
14922
14923 # We set $most_recent to be the most recently changed input file, including
14924 # this program itself (done much earlier in this file)
14925 foreach my $in (@input_files) {
14926     next unless -e $in;        # Keep going even if missing a file
14927     my $mod_time = (stat $in)[9];
14928     $most_recent = $mod_time if $mod_time > $most_recent;
14929
14930     # See that the input files have distinct names, to warn someone if they
14931     # are adding a new one
14932     if ($make_list) {
14933         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
14934         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
14935         my @directories = File::Spec->splitdir($directories);
14936         my $base = $file =~ s/\.txt$//;
14937         construct_filename($file, 'mutable', \@directories);
14938     }
14939 }
14940
14941 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
14942               || ! scalar @mktables_list_output_files  # or if no outputs known
14943               || $old_start_time < $most_recent;       # or out-of-date
14944
14945 # Now we check to see if any output files are older than youngest, if
14946 # they are, we need to continue on, otherwise we can presumably bail.
14947 if (! $rebuild) {
14948     foreach my $out (@mktables_list_output_files) {
14949         if ( ! file_exists($out)) {
14950             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
14951             $rebuild = 1;
14952             last;
14953          }
14954         #local $to_trace = 1 if main::DEBUG;
14955         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14956         if ( (stat $out)[9] <= $most_recent ) {
14957             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
14958             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
14959             $rebuild = 1;
14960             last;
14961         }
14962     }
14963 }
14964 if (! $rebuild) {
14965     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
14966     exit(0);
14967 }
14968 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
14969
14970 # Ready to do the major processing.  First create the perl pseudo-property.
14971 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
14972
14973 # Process each input file
14974 foreach my $file (@input_file_objects) {
14975     $file->run;
14976 }
14977
14978 # Finish the table generation.
14979
14980 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
14981 finish_Unicode();
14982
14983 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
14984 compile_perl();
14985
14986 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
14987 add_perl_synonyms();
14988
14989 print "Writing tables\n" if $verbosity >= $PROGRESS;
14990 write_all_tables();
14991
14992 # Write mktables.lst
14993 if ( $file_list and $make_list ) {
14994
14995     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14996     foreach my $file (@input_files, @files_actually_output) {
14997         my (undef, $directories, $file) = File::Spec->splitpath($file);
14998         my @directories = File::Spec->splitdir($directories);
14999         $file = join '/', @directories, $file;
15000     }
15001
15002     my $ofh;
15003     if (! open $ofh,">",$file_list) {
15004         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
15005         return
15006     }
15007     else {
15008         my $localtime = localtime $start_time;
15009         print $ofh <<"END";
15010 #
15011 # $file_list -- File list for $0.
15012 #
15013 #   Autogenerated starting on $start_time ($localtime)
15014 #
15015 # - First section is input files
15016 #   ($0 itself is not listed but is automatically considered an input)
15017 # - Section separator is /^=+\$/
15018 # - Second section is a list of output files.
15019 # - Lines matching /^\\s*#/ are treated as comments
15020 #   which along with blank lines are ignored.
15021 #
15022
15023 # Input files:
15024
15025 END
15026         print $ofh "$_\n" for sort(@input_files);
15027         print $ofh "\n=================================\n# Output files:\n\n";
15028         print $ofh "$_\n" for sort @files_actually_output;
15029         print $ofh "\n# ",scalar(@input_files)," input files\n",
15030                 "# ",scalar(@files_actually_output)+1," output files\n\n",
15031                 "# End list\n";
15032         close $ofh
15033             or Carp::my_carp("Failed to close $ofh: $!");
15034
15035         print "Filelist has ",scalar(@input_files)," input files and ",
15036             scalar(@files_actually_output)+1," output files\n"
15037             if $verbosity >= $VERBOSE;
15038     }
15039 }
15040
15041 # Output these warnings unless -q explicitly specified.
15042 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
15043     if (@unhandled_properties) {
15044         print "\nProperties and tables that unexpectedly have no code points\n";
15045         foreach my $property (sort @unhandled_properties) {
15046             print $property, "\n";
15047         }
15048     }
15049
15050     if (%potential_files) {
15051         print "\nInput files that are not considered:\n";
15052         foreach my $file (sort keys %potential_files) {
15053             print File::Spec->abs2rel($file), "\n";
15054         }
15055     }
15056     print "\nAll done\n" if $verbosity >= $VERBOSE;
15057 }
15058 exit(0);
15059
15060 # TRAILING CODE IS USED BY make_property_test_script()
15061 __DATA__
15062
15063 use strict;
15064 use warnings;
15065
15066 # If run outside the normal test suite on an ASCII platform, you can
15067 # just create a latin1_to_native() function that just returns its
15068 # inputs, because that's the only function used from test.pl
15069 require "test.pl";
15070
15071 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
15072 # constructed by mktables from the tables it generates, so if mktables is
15073 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
15074 # feasible properties; a few aren't currently feasible; see
15075 # is_code_point_usable() in mktables for details.
15076
15077 # Standard test packages are not used because this manipulates SIG_WARN.  It
15078 # exits 0 if every non-skipped test succeeded; -1 if any failed.
15079
15080 my $Tests = 0;
15081 my $Fails = 0;
15082
15083 sub Expect($$$$) {
15084     my $expected = shift;
15085     my $ord = shift;
15086     my $regex  = shift;
15087     my $warning_type = shift;   # Type of warning message, like 'deprecated'
15088                                 # or empty if none
15089     my $line   = (caller)[2];
15090     $ord = ord(latin1_to_native(chr($ord)));
15091
15092     # Convert the code point to hex form
15093     my $string = sprintf "\"\\x{%04X}\"", $ord;
15094
15095     my @tests = "";
15096
15097     # The first time through, use all warnings.  If the input should generate
15098     # a warning, add another time through with them turned off
15099     push @tests, "no warnings '$warning_type';" if $warning_type;
15100
15101     foreach my $no_warnings (@tests) {
15102
15103         # Store any warning messages instead of outputting them
15104         local $SIG{__WARN__} = $SIG{__WARN__};
15105         my $warning_message;
15106         $SIG{__WARN__} = sub { $warning_message = $_[0] };
15107
15108         $Tests++;
15109
15110         # A string eval is needed because of the 'no warnings'.
15111         # Assumes no parens in the regular expression
15112         my $result = eval "$no_warnings
15113                             my \$RegObj = qr($regex);
15114                             $string =~ \$RegObj ? 1 : 0";
15115         if (not defined $result) {
15116             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
15117             $Fails++;
15118         }
15119         elsif ($result ^ $expected) {
15120             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
15121             $Fails++;
15122         }
15123         elsif ($warning_message) {
15124             if (! $warning_type || ($warning_type && $no_warnings)) {
15125                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
15126                 $Fails++;
15127             }
15128             else {
15129                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
15130             }
15131         }
15132         elsif ($warning_type && ! $no_warnings) {
15133             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
15134             $Fails++;
15135         }
15136         else {
15137             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
15138         }
15139     }
15140     return;
15141 }
15142
15143 sub Error($) {
15144     my $regex  = shift;
15145     $Tests++;
15146     if (eval { 'x' =~ qr/$regex/; 1 }) {
15147         $Fails++;
15148         my $line = (caller)[2];
15149         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
15150     }
15151     else {
15152         my $line = (caller)[2];
15153         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
15154     }
15155     return;
15156 }
15157
15158 # GCBTest.txt character that separates grapheme clusters
15159 my $breakable_utf8 = my $breakable = chr(0xF7);
15160 utf8::upgrade($breakable_utf8);
15161
15162 # GCBTest.txt character that indicates that the adjoining code points are part
15163 # of the same grapheme cluster
15164 my $nobreak_utf8 = my $nobreak = chr(0xD7);
15165 utf8::upgrade($nobreak_utf8);
15166
15167 sub Test_X($) {
15168     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
15169     # Each such line is a sequence of code points given by their hex numbers,
15170     # separated by the two characters defined just before this subroutine that
15171     # indicate that either there can or cannot be a break between the adjacent
15172     # code points.  If there isn't a break, that means the sequence forms an
15173     # extended grapheme cluster, which means that \X should match the whole
15174     # thing.  If there is a break, \X should stop there.  This is all
15175     # converted by this routine into a match:
15176     #   $string =~ /(\X)/,
15177     # Each \X should match the next cluster; and that is what is checked.
15178
15179     my $template = shift;
15180
15181     my $line   = (caller)[2];
15182
15183     # The line contains characters above the ASCII range, but in Latin1.  It
15184     # may or may not be in utf8, and if it is, it may or may not know it.  So,
15185     # convert these characters to 8 bits.  If knows is in utf8, simply
15186     # downgrade.
15187     if (utf8::is_utf8($template)) {
15188         utf8::downgrade($template);
15189     } else {
15190
15191         # Otherwise, if it is in utf8, but doesn't know it, the next lines
15192         # convert the two problematic characters to their 8-bit equivalents.
15193         # If it isn't in utf8, they don't harm anything.
15194         use bytes;
15195         $template =~ s/$nobreak_utf8/$nobreak/g;
15196         $template =~ s/$breakable_utf8/$breakable/g;
15197     }
15198
15199     # Get rid of the leading and trailing breakables
15200     $template =~ s/^ \s* $breakable \s* //x;
15201     $template =~ s/ \s* $breakable \s* $ //x;
15202
15203     # And no-breaks become just a space.
15204     $template =~ s/ \s* $nobreak \s* / /xg;
15205
15206     # Split the input into segments that are breakable between them.
15207     my @segments = split /\s*$breakable\s*/, $template;
15208
15209     my $string = "";
15210     my $display_string = "";
15211     my @should_match;
15212     my @should_display;
15213
15214     # Convert the code point sequence in each segment into a Perl string of
15215     # characters
15216     foreach my $segment (@segments) {
15217         my @code_points = split /\s+/, $segment;
15218         my $this_string = "";
15219         my $this_display = "";
15220         foreach my $code_point (@code_points) {
15221             $this_string .= latin1_to_native(chr(hex $code_point));
15222             $this_display .= "\\x{$code_point}";
15223         }
15224
15225         # The next cluster should match the string in this segment.
15226         push @should_match, $this_string;
15227         push @should_display, $this_display;
15228         $string .= $this_string;
15229         $display_string .= $this_display;
15230     }
15231
15232     # If a string can be represented in both non-ut8 and utf8, test both cases
15233     UPGRADE:
15234     for my $to_upgrade (0 .. 1) {
15235
15236         if ($to_upgrade) {
15237
15238             # If already in utf8, would just be a repeat
15239             next UPGRADE if utf8::is_utf8($string);
15240
15241             utf8::upgrade($string);
15242         }
15243
15244         # Finally, do the \X match.
15245         my @matches = $string =~ /(\X)/g;
15246
15247         # Look through each matched cluster to verify that it matches what we
15248         # expect.
15249         my $min = (@matches < @should_match) ? @matches : @should_match;
15250         for my $i (0 .. $min - 1) {
15251             $Tests++;
15252             if ($matches[$i] eq $should_match[$i]) {
15253                 print "ok $Tests - ";
15254                 if ($i == 0) {
15255                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
15256                 } else {
15257                     print "And \\X #", $i + 1,
15258                 }
15259                 print " correctly matched $should_display[$i]; line $line\n";
15260             } else {
15261                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
15262                                                     unpack("U*", $matches[$i]));
15263                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
15264                     $i + 1,
15265                     " should have matched $should_display[$i]",
15266                     " but instead matched $matches[$i]",
15267                     ".  Abandoning rest of line $line\n";
15268                 next UPGRADE;
15269             }
15270         }
15271
15272         # And the number of matches should equal the number of expected matches.
15273         $Tests++;
15274         if (@matches == @should_match) {
15275             print "ok $Tests - Nothing was left over; line $line\n";
15276         } else {
15277             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
15278         }
15279     }
15280
15281     return;
15282 }
15283
15284 sub Finished() {
15285     print "1..$Tests\n";
15286     exit($Fails ? -1 : 0);
15287 }
15288
15289 Error('\p{Script=InGreek}');    # Bug #69018
15290 Test_X("1100 $nobreak 1161");  # Bug #70940
15291 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
15292 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
15293 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726